From 0435abb37b8b6499e71ed92dcaf1534082bd220a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 5 Jul 2024 13:22:05 +0100 Subject: [PATCH 01/28] Replace LLMBlock model_prompt param with model_family In preparation for custom pipeline configuration files, do not require model_prompt as an LLMBlock param - it can have built-in knowledge of the correct prompt to use per model_family. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/default_flows.py | 47 +++++++++------------------- src/instructlab/sdg/generate_data.py | 3 +- src/instructlab/sdg/llmblock.py | 26 ++++++++++++--- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 818c4972..dd3e781e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -10,23 +10,6 @@ from .llmblock import LLMBlock from .utilblocks import CombineColumnsBlock -MODEL_FAMILY_MIXTRAL = "mixtral" -MODEL_FAMILY_MERLINITE = "merlinite" - -_MODEL_PROMPT_MIXTRAL = " [INST] {prompt} [/INST]" -_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. 
You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'" - -_MODEL_PROMPTS = { - MODEL_FAMILY_MIXTRAL: _MODEL_PROMPT_MIXTRAL, - MODEL_FAMILY_MERLINITE: _MODEL_PROMPT_MERLINITE, -} - - -def _get_model_prompt(model_family): - if model_family not in _MODEL_PROMPTS: - raise ValueError(f"Unknown model family: {model_family}") - return _MODEL_PROMPTS[model_family] - class Flow(ABC): def __init__( @@ -53,7 +36,7 @@ def get_flow(self) -> list: "config_path": "", # must be set by subclass "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["output"], }, "gen_kwargs": { @@ -110,7 +93,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["mmlubench_question", "mmlubench_answer"], }, "gen_kwargs": { @@ -135,7 +118,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["question", "response"], "parser_kwargs": { "parser_name": "custom", @@ -157,7 +140,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["explanation", "judgment"], }, "gen_kwargs": { @@ -186,7 +169,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["feedback", "score"], }, "gen_kwargs": { @@ -216,7 +199,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": 
self.model_family, "output_cols": ["explanation", "rating"], }, "gen_kwargs": { @@ -253,7 +236,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["question"], "batch_kwargs": { "num_samples": self.num_instructions_to_generate, @@ -271,7 +254,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -299,7 +282,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["response"], }, }, @@ -313,7 +296,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -347,7 +330,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["context"], }, "gen_kwargs": { @@ -367,7 +350,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["question"], "batch_kwargs": { "num_samples": 3, @@ -385,7 +368,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -413,7 +396,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": 
self.model_family, "output_cols": ["response"], }, }, @@ -427,7 +410,7 @@ def get_flow(self) -> list: ), "client": self.client, "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), + "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 36c6cad4..89a3ae5b 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -18,8 +18,6 @@ # pylint: disable=ungrouped-imports from instructlab.sdg import SDG, utils from instructlab.sdg.default_flows import ( - MODEL_FAMILY_MERLINITE, - MODEL_FAMILY_MIXTRAL, MMLUBenchFlow, SimpleFreeformSkillFlow, SimpleGroundedSkillFlow, @@ -28,6 +26,7 @@ SynthKnowledgeFlow, SynthSkillsFlow, ) +from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL from instructlab.sdg.pipeline import Pipeline from instructlab.sdg.utils import models from instructlab.sdg.utils.taxonomy import ( diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 4153a191..ad21dd68 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -13,6 +13,23 @@ logger = setup_logger(__name__) +MODEL_FAMILY_MIXTRAL = "mixtral" +MODEL_FAMILY_MERLINITE = "merlinite" + +_MODEL_PROMPT_MIXTRAL = " [INST] {prompt} [/INST]" +_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. 
You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'" + +_MODEL_PROMPTS = { + MODEL_FAMILY_MIXTRAL: _MODEL_PROMPT_MIXTRAL, + MODEL_FAMILY_MERLINITE: _MODEL_PROMPT_MERLINITE, +} + + +def _get_model_prompt(model_family): + if model_family not in _MODEL_PROMPTS: + raise ValueError(f"Unknown model family: {model_family}") + return _MODEL_PROMPTS[model_family] + def server_supports_batched(client, model_id: str) -> bool: supported = getattr(client, "server_supports_batched", None) @@ -42,9 +59,9 @@ def __init__( config_path, client, model_id, + model_family, output_cols, parser_kwargs={}, - model_prompt="{prompt}", **batch_kwargs, ) -> None: super().__init__(block_name) @@ -55,7 +72,8 @@ def __init__( self.prompt_template = self.prompt_struct.format(**self.block_config) self.client = client self.model = model_id - self.model_prompt = model_prompt + self.model_family = model_family + self.model_prompt = _get_model_prompt(self.model_family) self.output_cols = output_cols self.batch_params = batch_kwargs.get("batch_kwargs", {}) self.parser_name = parser_kwargs.get("parser_name", None) @@ -193,10 +211,10 @@ def __init__( config_paths, client, model_id, + model_family, output_cols, selector_column_name, parser_kwargs={}, - model_prompt="{prompt}", **batch_kwargs, ) -> None: super().__init__( @@ -204,9 +222,9 @@ def __init__( config_paths[0][0], client, model_id, + model_family, output_cols, parser_kwargs=parser_kwargs, - model_prompt=model_prompt, **batch_kwargs, ) self.selector_column_name = selector_column_name From 49c87d57cdf390264fd1bfce3d41c99935c12b40 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 2 Jul 2024 13:47:20 +0100 Subject: [PATCH 02/28] Add a PipelineContext class In order to prepare for pipeline definitions in YAML, remove runtime parameters like the OpenAI client, model ID, and model family from the pipeline definition into a PipelineContext object that all blocks have 
access to. Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 6 +- scripts/test_grounded_skills.py | 6 +- scripts/test_knowledge.py | 13 ++-- src/instructlab/sdg/block.py | 3 +- src/instructlab/sdg/default_flows.py | 100 +++++++-------------------- src/instructlab/sdg/filterblock.py | 11 ++- src/instructlab/sdg/generate_data.py | 48 +++++-------- src/instructlab/sdg/llmblock.py | 31 ++++----- src/instructlab/sdg/pipeline.py | 20 +++++- src/instructlab/sdg/utilblocks.py | 16 +++-- tests/test_filterblock.py | 3 + 11 files changed, 112 insertions(+), 145 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index a8612c09..058fd64f 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.default_flows import SynthSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +from src.instructlab.sdg.pipeline import Pipeline, PipelineContext # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" @@ -49,7 +49,9 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() +ctx = PipelineContext(client, "mixtral", teacher_model, 1) + +skills_flow = SynthSkillsFlow(ctx).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 338edb6c..6d0bdc1b 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline +from src.instructlab.sdg.pipeline import Pipeline, PipelineContext # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" @@ -97,7 +97,9 @@ ds = Dataset.from_list(samples) -skills_flow = 
SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() +ctx = PipelineContext(client, "mixtral", teacher_model, 10) + +skills_flow = SynthGroundedSkillsFlow(ctx).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aeedcf59..2b534903 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -8,7 +8,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow -from src.instructlab.sdg.pipeline import Pipeline +from src.instructlab.sdg.pipeline import Pipeline, PipelineContext # Please don't add you vLLM endpoint key here openai_api_key = "EMPTY" @@ -38,12 +38,13 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() -knowledge_pipe = Pipeline(knowledge_flow) -mmlu_pipe = Pipeline(mmlu_flow) +ctx = PipelineContext(client, "mixtral", teacher_model, 1) -sdg = SDG([mmlu_pipe, knowledge_pipe]) +mmlu_flow = MMLUBenchFlow(ctx).get_flow() +knowledge_flow = SynthKnowledgeFlow(ctx).get_flow() +knowledge_pipe = Pipeline(mmlu_flow + knowledge_flow) + +sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) print(mmlubench_data) diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index 09433f55..e8807420 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -14,7 +14,8 @@ class Block(ABC): - def __init__(self, block_name: str) -> None: + def __init__(self, ctx, block_name: str) -> None: + self.ctx = ctx self.block_name = block_name @staticmethod diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index dd3e781e..ab6396d2 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: 
Apache-2.0 # Standard from abc import ABC, abstractmethod -from importlib import resources import operator import os @@ -12,14 +11,8 @@ class Flow(ABC): - def __init__( - self, client, model_family, model_id, num_instructions_to_generate - ) -> None: - self.client = client - self.model_family = model_family - self.model_id = model_id - self.num_instructions_to_generate = num_instructions_to_generate - self.sdg_base = resources.files(__package__) + def __init__(self, ctx) -> None: + self.ctx = ctx @abstractmethod def get_flow(self) -> list: @@ -34,15 +27,12 @@ def get_flow(self) -> list: "block_config": { "block_name": "", # must be set by subclass "config_path": "", # must be set by subclass - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["output"], }, "gen_kwargs": { "max_tokens": 2048, "temperature": 0.7, - "n": self.num_instructions_to_generate, + "n": self.ctx.num_instructions_to_generate, }, "drop_duplicates": ["output"], } @@ -53,7 +43,7 @@ class SimpleKnowledgeFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() flow[0]["block_config"]["config_path"] = os.path.join( - self.sdg_base, "configs/knowledge/simple_generate_qa.yaml" + self.ctx.sdg_base, "configs/knowledge/simple_generate_qa.yaml" ) flow[0]["block_config"]["block_name"] = "gen_knowledge" return flow @@ -63,10 +53,9 @@ class SimpleFreeformSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() flow[0]["block_config"]["config_path"] = os.path.join( - self.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" + self.ctx.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" ) flow[0]["block_config"]["block_name"] = "gen_skill_freeform" - flow[0]["block_config"]["block_name"] = "gen_skill_freeform" return flow @@ -74,7 +63,7 @@ class SimpleGroundedSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() flow[0]["block_config"]["config_path"] = os.path.join( - 
self.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" + self.ctx.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" ) flow[0]["block_config"]["block_name"] = "gen_skill_grounded" return flow @@ -82,18 +71,14 @@ def get_flow(self) -> list: class MMLUBenchFlow(Flow): def get_flow(self) -> list: - self.sdg_base = resources.files(__package__) return [ { "block_type": LLMBlock, "block_config": { "block_name": "gen_mmlu_knowledge", "config_path": os.path.join( - self.sdg_base, "configs/knowledge/mcq_generation.yaml" + self.ctx.sdg_base, "configs/knowledge/mcq_generation.yaml" ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["mmlubench_question", "mmlubench_answer"], }, "gen_kwargs": { @@ -113,12 +98,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "gen_knowledge", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/knowledge/generate_questions_responses.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["question", "response"], "parser_kwargs": { "parser_name": "custom", @@ -136,11 +118,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "eval_faithfulness_qa_pair", "config_path": os.path.join( - self.sdg_base, "configs/knowledge/evaluate_faithfulness.yaml" + self.ctx.sdg_base, + "configs/knowledge/evaluate_faithfulness.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["explanation", "judgment"], }, "gen_kwargs": { @@ -165,11 +145,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "eval_relevancy_qa_pair", "config_path": os.path.join( - self.sdg_base, "configs/knowledge/evaluate_relevancy.yaml" + self.ctx.sdg_base, + "configs/knowledge/evaluate_relevancy.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["feedback", "score"], }, 
"gen_kwargs": { @@ -195,11 +173,8 @@ def get_flow(self) -> list: "block_config": { "block_name": "eval_verify_question", "config_path": os.path.join( - self.sdg_base, "configs/knowledge/evaluate_question.yaml" + self.ctx.sdg_base, "configs/knowledge/evaluate_question.yaml" ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["explanation", "rating"], }, "gen_kwargs": { @@ -231,15 +206,12 @@ def get_flow(self) -> list: "block_config": { "block_name": "gen_questions", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/freeform_questions.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["question"], "batch_kwargs": { - "num_samples": self.num_instructions_to_generate, + "num_samples": self.ctx.num_instructions_to_generate, }, }, "drop_duplicates": ["question"], @@ -249,12 +221,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "eval_questions", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/evaluate_freeform_questions.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -277,12 +246,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "gen_responses", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/freeform_responses.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["response"], }, }, @@ -291,12 +257,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "evaluate_qa_pair", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/evaluate_freeform_pair.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -325,18 +288,15 @@ def 
get_flow(self) -> list: "block_config": { "block_name": "gen_contexts", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/contexts.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["context"], }, "gen_kwargs": { "temperature": 0.7, "max_tokens": 2048, - "n": self.num_instructions_to_generate, + "n": self.ctx.num_instructions_to_generate, }, "drop_duplicates": ["context"], }, @@ -345,12 +305,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "gen_grounded_questions", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/grounded_questions.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["question"], "batch_kwargs": { "num_samples": 3, @@ -363,12 +320,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "eval_grounded_questions", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/evaluate_grounded_questions.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, @@ -391,12 +345,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "gen_grounded_responses", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/grounded_responses.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["response"], }, }, @@ -405,12 +356,9 @@ def get_flow(self) -> list: "block_config": { "block_name": "evaluate_grounded_qa_pair", "config_path": os.path.join( - self.sdg_base, + self.ctx.sdg_base, "configs/skills/evaluate_grounded_pair.yaml", ), - "client": self.client, - "model_id": self.model_id, - "model_family": self.model_family, "output_cols": ["evaluation", "score"], }, }, diff --git a/src/instructlab/sdg/filterblock.py 
b/src/instructlab/sdg/filterblock.py index f5551b02..609ce142 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -11,12 +11,19 @@ class FilterByValueBlock(Block): def __init__( - self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs + self, + ctx, + filter_column, + filter_value, + operation, + convert_dtype=None, + **batch_kwargs, ) -> None: """ Initializes a new instance of the FilterByValueBlock class. Parameters: + - ctx (PipelineContext): A PipelineContext object containing runtime parameters. - filter_column (str): The name of the column in the dataset to apply the filter on. - filter_value (any or list of any): The value(s) to filter by. - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset. @@ -26,7 +33,7 @@ def __init__( Returns: None """ - super().__init__(block_name=self.__class__.__name__) + super().__init__(ctx, block_name=self.__class__.__name__) self.value = filter_value if isinstance(filter_value, list) else [filter_value] self.column_name = filter_column self.operation = operation diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 89a3ae5b..abcd6665 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -27,7 +27,7 @@ SynthSkillsFlow, ) from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL -from instructlab.sdg.pipeline import Pipeline +from instructlab.sdg.pipeline import Pipeline, PipelineContext from instructlab.sdg.utils import models from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, @@ -183,37 +183,25 @@ def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_ge else: raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.") - sdg_knowledge = SDG( - [ - Pipeline( - flow_type( - 
client, model_family, model_name, num_instructions_to_generate - ).get_flow() - ) - for flow_type in knowledge_flow_types - ] - ) - sdg_freeform_skill = SDG( - [ - Pipeline( - flow_type( - client, model_family, model_name, num_instructions_to_generate - ).get_flow() - ) - for flow_type in freeform_skill_flow_types - ] + ctx = PipelineContext( + client, model_family, model_name, num_instructions_to_generate ) - sdg_grounded_skill = SDG( - [ - Pipeline( - flow_type( - client, model_family, model_name, num_instructions_to_generate - ).get_flow() - ) - for flow_type in grounded_skill_flow_types - ] + + def build_pipeline(flow_types): + block_configs = [] + for flow_type in flow_types: + block_configs.extend(flow_type(ctx).get_flow()) + return Pipeline(ctx, block_configs) + + knowledge_pipeline = build_pipeline(knowledge_flow_types) + freeform_skill_pipeline = build_pipeline(freeform_skill_flow_types) + grounded_skill_pipeline = build_pipeline(grounded_skill_flow_types) + + return ( + SDG([knowledge_pipeline]), + SDG([freeform_skill_pipeline]), + SDG([grounded_skill_pipeline]), ) - return sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill # TODO - parameter removal needs to be done in sync with a CLI change. 
diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ad21dd68..eaa58556 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -55,39 +55,36 @@ class LLMBlock(Block): # pylint: disable=too-many-instance-attributes def __init__( self, + ctx, block_name, config_path, - client, - model_id, - model_family, output_cols, parser_kwargs={}, **batch_kwargs, ) -> None: - super().__init__(block_name) + super().__init__(ctx, block_name) self.block_config = self._load_config(config_path) self.prompt_struct = ( """{system}\n{introduction}\n{principles}\n{examples}\n{generation}""" ) self.prompt_template = self.prompt_struct.format(**self.block_config) - self.client = client - self.model = model_id - self.model_family = model_family - self.model_prompt = _get_model_prompt(self.model_family) + self.model_prompt = _get_model_prompt(self.ctx.model_family) self.output_cols = output_cols self.batch_params = batch_kwargs.get("batch_kwargs", {}) self.parser_name = parser_kwargs.get("parser_name", None) self.parsing_pattern = parser_kwargs.get("parsing_pattern", None) self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None) self.defaults = { - "model": self.model, + "model": self.ctx.model_id, "temperature": 0, "max_tokens": 12000, } # Whether the LLM server supports a list of input prompts # and supports the n parameter to generate n outputs per input - self.server_supports_batched = server_supports_batched(client, model_id) + self.server_supports_batched = server_supports_batched( + self.ctx.client, self.ctx.model_id + ) def _parse(self, generated_string) -> dict: matches = {} @@ -137,14 +134,16 @@ def _generate(self, samples, **gen_kwargs) -> list: generate_args = {**self.defaults, **gen_kwargs} if self.server_supports_batched: - response = self.client.completions.create(prompt=prompts, **generate_args) + response = self.ctx.client.completions.create( + prompt=prompts, **generate_args + ) return 
[choice.text.strip() for choice in response.choices] n = gen_kwargs.get("n", 1) results = [] for prompt in prompts: for _ in range(n): - response = self.client.completions.create( + response = self.ctx.client.completions.create( prompt=prompt, **generate_args ) results.append(response.choices[0].text.strip()) @@ -207,22 +206,18 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: class ConditionalLLMBlock(LLMBlock): def __init__( self, + ctx, block_name, config_paths, - client, - model_id, - model_family, output_cols, selector_column_name, parser_kwargs={}, **batch_kwargs, ) -> None: super().__init__( + ctx, block_name, config_paths[0][0], - client, - model_id, - model_family, output_cols, parser_kwargs=parser_kwargs, **batch_kwargs, diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index bc570a83..93464601 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +from importlib import resources + # Third Party from datasets import Dataset @@ -8,12 +11,25 @@ logger = setup_logger(__name__) +class PipelineContext: + def __init__( + self, client, model_family, model_id, num_instructions_to_generate + ) -> None: + self.client = client + self.model_family = model_family + self.model_id = model_id + self.num_instructions_to_generate = num_instructions_to_generate + self.sdg_base = resources.files(__package__) + + class Pipeline: - def __init__(self, chained_blocks: list) -> None: + def __init__(self, ctx, chained_blocks: list) -> None: """ Initialize the Pipeline class with a configuration dictionary. 
config_dict: the run config py or yaml loaded into a dictionary """ + # ctx is a PipelineContext object that supplies context configuration to every block + self.ctx = ctx # pipeline config is the run configuration that consists of the pipeline steps self.chained_blocks = chained_blocks @@ -36,7 +52,7 @@ def generate(self, dataset) -> Dataset: drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(**block_config) + block = block_type(self.ctx, **block_config) logger.info("Running block: %s", block_config["block_name"]) logger.info(dataset) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index db04b5a1..a93a5742 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -10,9 +10,11 @@ class SamplePopulatorBlock(Block): - def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None: + def __init__( + self, ctx, config_paths, column_name, post_fix="", **batch_kwargs + ) -> None: super().__init__( - block_name=self.__class__.__name__ + ctx, block_name=self.__class__.__name__ ) # Call the base class's __init__ self.configs = {} for config in config_paths: @@ -35,8 +37,8 @@ def generate(self, samples) -> Dataset: class SelectorBlock(Block): - def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None: - super().__init__(block_name=self.__class__.__name__) + def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> None: + super().__init__(ctx, block_name=self.__class__.__name__) self.choice_map = choice_map self.choice_col = choice_col self.output_col = output_col @@ -52,8 +54,10 @@ def generate(self, samples: Dataset) -> Dataset: class CombineColumnsBlock(Block): - def __init__(self, columns, output_col, separator="\n\n", **batch_kwargs) -> None: - super().__init__(block_name=self.__class__.__name__) + def __init__( + 
self, ctx, columns, output_col, separator="\n\n", **batch_kwargs + ) -> None: + super().__init__(ctx, block_name=self.__class__.__name__) self.columns = columns self.output_col = output_col self.separator = separator diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py index 7b8b1ce7..53531fd0 100644 --- a/tests/test_filterblock.py +++ b/tests/test_filterblock.py @@ -8,17 +8,20 @@ # First Party from instructlab.sdg.filterblock import FilterByValueBlock +from instructlab.sdg.pipeline import PipelineContext class TestFilterByValueBlock(unittest.TestCase): def setUp(self): self.block = FilterByValueBlock( + PipelineContext(None, None, None, None), filter_column="age", filter_value=30, operation=operator.eq, convert_dtype=int, ) self.block_with_list = FilterByValueBlock( + PipelineContext(None, None, None, None), filter_column="age", filter_value=[30, 35], operation=operator.eq, From 7cfbaa9dc11a6ad338a5aabc133cb1a8736142a8 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 10 Jul 2024 13:13:06 -0400 Subject: [PATCH 03/28] Fix multiprocessing issues in FilterByValueBlock This addresses issues with using num_proc>1 with Dataset.map() and Dataset.filter(). The first issue is: ``` File "/usr/lib64/python3.11/pickle.py", line 578, in save rv = reduce(self.proto) ^^^^^^^^^^^^^^^^^^ TypeError: cannot pickle 'SSLContext' object ``` What was happening here is that the entire FilterByValueBlock object was being serialized to send to the multiprocessing worker. 
And now that this includes PipelineContext, which includes the OpenAI client object, which includes SSLContext, we hit a known issue: uqfoundation/dill#308 The second issue is specific to map(): ``` ValueError: The features can't be aligned because the key score of features {'task_description': Value(dtype='string', id=None), 'seed_question': Value(dtype='string', id=None), 'seed_response': Value(dtype='string', id=None), 'num_samples': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'evaluation': Value(dtype='string', id=None), 'score': Value(dtype='string', id=None)} has unexpected type - Value(dtype='string', id=None) (expected either Value(dtype='float64', id=None) or Value("null"). ``` It appears that in the datasets library, only in the case of num_proc>1, when we hit the "error converting dtype" case and set the column to None, the column is still considered a string column rather than the new dtype. This second issue deserves further investigation and may require a fix to the datasets library. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/filterblock.py | 55 ++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 609ce142..96d3e7af 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -9,6 +9,39 @@ logger = setup_logger(__name__) +# Note - this is not a method on the class below in order to avoid +# serializing the object itself when multi-processing is used. +# In particular, SSLContext - embedded in the OpenAI client object - +# cannot be pickled. 
+def _filter_by_values(samples, column, op, values, num_proc=1): + return samples.filter( + lambda x: any(op(x[column], value) for value in values), + num_proc=num_proc, + ) + + +def _map_dtype(samples, column, dtype, num_proc=1): + def convert_column(sample): + try: + sample[column] = dtype(sample[column]) + except ValueError as e: + logger.error( + "Error converting dtype: %s, filling with None to be filtered later", e + ) + sample[column] = None + return sample + + # FIXME: it appears multiprocessing map has issues with + # None columns. If we pass num_proc>1 here and the error + # case is triggered above, we get: + # ValueError: The features can't be aligned ... + # because the column is still considered a string not + # the new dtype. + num_proc = 1 + + return samples.map(convert_column, num_proc=num_proc) + + class FilterByValueBlock(Block): def __init__( self, @@ -40,26 +73,12 @@ def __init__( self.convert_dtype = convert_dtype self.num_procs = batch_kwargs.get("num_procs", 1) - def _convert_dtype(self, sample): - try: - sample[self.column_name] = self.convert_dtype(sample[self.column_name]) - except ValueError as e: - logger.error( - "Error converting dtype: %s, filling with None to be filtered later", e - ) - sample[self.column_name] = None - return sample - def generate(self, samples) -> Dataset: if self.convert_dtype: - samples = samples.map( - self._convert_dtype, - num_proc=self.num_procs, + samples = _map_dtype( + samples, self.column_name, self.convert_dtype, self.num_procs ) - return samples.filter( - lambda x: any( - self.operation(x[self.column_name], value) for value in self.value - ), - num_proc=self.num_procs, + return _filter_by_values( + samples, self.column_name, self.operation, self.value, self.num_procs ) From 9d925486db22b7ff3fed66822912e894ecbcbd39 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 11 Jul 2024 22:08:43 +0100 Subject: [PATCH 04/28] Fix multiprocessing issues in utilblocks Address the following issue with using 
num_proc>1 with Dataset.map(): ``` File "/usr/lib64/python3.11/pickle.py", line 578, in save rv = reduce(self.proto) ^^^^^^^^^^^^^^^^^^ TypeError: cannot pickle 'SSLContext' object ``` The entire block object is being serialized to be sent to the multiprocessing worker. And now that this includes PipelineContext, which includes the OpenAI client object, which includes SSLContext, we hit a known issue: uqfoundation/dill#308 Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/utilblocks.py | 53 +++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index a93a5742..4da8330c 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -27,13 +27,18 @@ def __init__( self.column_name = column_name self.num_procs = batch_kwargs.get("num_procs", 8) - def _generate(self, sample) -> dict: - sample = {**sample, **self.configs[sample[self.column_name]]} - return sample + # Using a static method to avoid serializing self when using multiprocessing + @staticmethod + def _map_populate(samples, configs, column_name, num_proc=1): + def populate(sample): + return {**sample, **configs[sample[column_name]]} + + return samples.map(populate, num_proc) def generate(self, samples) -> Dataset: - samples = samples.map(self._generate, num_proc=self.num_procs) - return samples + return self._map_populate_samples( + samples, self.configs, self.column_name, self.num_procs + ) class SelectorBlock(Block): @@ -44,13 +49,23 @@ def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> N self.output_col = output_col self.num_procs = batch_kwargs.get("num_procs", 8) - def _generate(self, sample) -> dict: - sample[self.output_col] = sample[self.choice_map[sample[self.choice_col]]] - return sample + # Using a static method to avoid serializing self when using multiprocessing + @staticmethod + def _map_select_choice(samples, choice_map, 
choice_col, output_col, num_proc=1): + def select_choice(sample) -> dict: + sample[output_col] = sample[choice_map[sample[choice_col]]] + return sample + + return samples.map(select_choice, num_proc) def generate(self, samples: Dataset) -> Dataset: - samples = samples.map(self._generate, num_proc=self.num_procs) - return samples + return self._map_select_choice( + samples, + self.choice_map, + self.choice_col, + self.output_col, + self.num_procs, + ) class CombineColumnsBlock(Block): @@ -63,12 +78,16 @@ def __init__( self.separator = separator self.num_procs = batch_kwargs.get("num_procs", 8) - def _generate(self, sample) -> dict: - sample[self.output_col] = self.separator.join( - [sample[col] for col in self.columns] - ) - return sample + # Using a static method to avoid serializing self when using multiprocessing + @staticmethod + def _map_combine(samples, columns, output_col, separator, num_proc=1): + def combine(sample): + sample[output_col] = separator.join([sample[col] for col in columns]) + return sample + + return samples.map(combine, num_proc=num_proc) def generate(self, samples: Dataset) -> Dataset: - samples = samples.map(self._generate, num_proc=self.num_procs) - return samples + return self._map_combine( + samples, self.columns, self.output_col, self.separator, self.num_procs + ) From 23dd08ea73804a5d765dad0214f1bbb160c1ba66 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 00:02:39 +0100 Subject: [PATCH 05/28] Allow block_config.config_path to be relative In order to remove another runtime parameter from pipeline definitions to allow us to move to using YAML files. 
Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/block.py | 6 +++ src/instructlab/sdg/default_flows.py | 81 +++++++--------------------- 2 files changed, 26 insertions(+), 61 deletions(-) diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index e8807420..a28136c4 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -3,6 +3,7 @@ from abc import ABC from collections import ChainMap from typing import Any, Dict, Union +import os.path # Third Party import yaml @@ -42,8 +43,13 @@ def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]: """ Load the configuration file for this block. + If the supplied configuration file is a relative path, it is assumed + to be part of this Python package. + :param config_path: The path to the configuration file. :return: The loaded configuration. """ + if not os.path.isabs(config_path): + config_path = os.path.join(self.ctx.sdg_base, config_path) with open(config_path, "r", encoding="utf-8") as config_file: return yaml.safe_load(config_file) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index ab6396d2..2839e212 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -2,7 +2,6 @@ # Standard from abc import ABC, abstractmethod import operator -import os # Local from .filterblock import FilterByValueBlock @@ -42,8 +41,8 @@ def get_flow(self) -> list: class SimpleKnowledgeFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - flow[0]["block_config"]["config_path"] = os.path.join( - self.ctx.sdg_base, "configs/knowledge/simple_generate_qa.yaml" + flow[0]["block_config"]["config_path"] = ( + "configs/knowledge/simple_generate_qa.yaml" ) flow[0]["block_config"]["block_name"] = "gen_knowledge" return flow @@ -52,8 +51,8 @@ def get_flow(self) -> list: class SimpleFreeformSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - 
flow[0]["block_config"]["config_path"] = os.path.join( - self.ctx.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" + flow[0]["block_config"]["config_path"] = ( + "configs/skills/simple_generate_qa_freeform.yaml" ) flow[0]["block_config"]["block_name"] = "gen_skill_freeform" return flow @@ -62,8 +61,8 @@ def get_flow(self) -> list: class SimpleGroundedSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - flow[0]["block_config"]["config_path"] = os.path.join( - self.ctx.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" + flow[0]["block_config"]["config_path"] = ( + "configs/skills/simple_generate_qa_grounded.yaml" ) flow[0]["block_config"]["block_name"] = "gen_skill_grounded" return flow @@ -76,9 +75,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_mmlu_knowledge", - "config_path": os.path.join( - self.ctx.sdg_base, "configs/knowledge/mcq_generation.yaml" - ), + "config_path": "configs/knowledge/mcq_generation.yaml", "output_cols": ["mmlubench_question", "mmlubench_answer"], }, "gen_kwargs": { @@ -97,10 +94,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_knowledge", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/knowledge/generate_questions_responses.yaml", - ), + "config_path": "configs/knowledge/generate_questions_responses.yaml", "output_cols": ["question", "response"], "parser_kwargs": { "parser_name": "custom", @@ -117,10 +111,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "eval_faithfulness_qa_pair", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/knowledge/evaluate_faithfulness.yaml", - ), + "config_path": "configs/knowledge/evaluate_faithfulness.yaml", "output_cols": ["explanation", "judgment"], }, "gen_kwargs": { @@ -144,10 +135,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "eval_relevancy_qa_pair", - 
"config_path": os.path.join( - self.ctx.sdg_base, - "configs/knowledge/evaluate_relevancy.yaml", - ), + "config_path": "configs/knowledge/evaluate_relevancy.yaml", "output_cols": ["feedback", "score"], }, "gen_kwargs": { @@ -172,9 +160,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "eval_verify_question", - "config_path": os.path.join( - self.ctx.sdg_base, "configs/knowledge/evaluate_question.yaml" - ), + "config_path": "configs/knowledge/evaluate_question.yaml", "output_cols": ["explanation", "rating"], }, "gen_kwargs": { @@ -205,10 +191,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_questions", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/freeform_questions.yaml", - ), + "config_path": "configs/skills/freeform_questions.yaml", "output_cols": ["question"], "batch_kwargs": { "num_samples": self.ctx.num_instructions_to_generate, @@ -220,10 +203,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "eval_questions", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/evaluate_freeform_questions.yaml", - ), + "config_path": "configs/skills/evaluate_freeform_questions.yaml", "output_cols": ["evaluation", "score"], }, }, @@ -245,10 +225,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_responses", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/freeform_responses.yaml", - ), + "config_path": "configs/skills/freeform_responses.yaml", "output_cols": ["response"], }, }, @@ -256,10 +233,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "evaluate_qa_pair", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/evaluate_freeform_pair.yaml", - ), + "config_path": "configs/skills/evaluate_freeform_pair.yaml", "output_cols": ["evaluation", "score"], }, }, @@ -287,10 +261,7 @@ def get_flow(self) -> 
list: "block_type": LLMBlock, "block_config": { "block_name": "gen_contexts", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/contexts.yaml", - ), + "config_path": "configs/skills/contexts.yaml", "output_cols": ["context"], }, "gen_kwargs": { @@ -304,10 +275,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_grounded_questions", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/grounded_questions.yaml", - ), + "config_path": "configs/skills/grounded_questions.yaml", "output_cols": ["question"], "batch_kwargs": { "num_samples": 3, @@ -319,10 +287,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "eval_grounded_questions", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/evaluate_grounded_questions.yaml", - ), + "config_path": "configs/skills/evaluate_grounded_questions.yaml", "output_cols": ["evaluation", "score"], }, }, @@ -344,10 +309,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "gen_grounded_responses", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/grounded_responses.yaml", - ), + "config_path": "configs/skills/grounded_responses.yaml", "output_cols": ["response"], }, }, @@ -355,10 +317,7 @@ def get_flow(self) -> list: "block_type": LLMBlock, "block_config": { "block_name": "evaluate_grounded_qa_pair", - "config_path": os.path.join( - self.ctx.sdg_base, - "configs/skills/evaluate_grounded_pair.yaml", - ), + "config_path": "configs/skills/evaluate_grounded_pair.yaml", "output_cols": ["evaluation", "score"], }, }, From 9fc272ca1e01962410ba09c87def800b29629076 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 00:14:40 +0100 Subject: [PATCH 06/28] Fix block_name handling All Block subclasses but LLMBlock are failing to pass the block_name from block_config down to the base class, instead they are incorrectly passing the block type as its 
name. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/filterblock.py | 4 +++- src/instructlab/sdg/utilblocks.py | 16 ++++++++-------- tests/test_filterblock.py | 2 ++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 96d3e7af..afb58b7b 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -46,6 +46,7 @@ class FilterByValueBlock(Block): def __init__( self, ctx, + block_name, filter_column, filter_value, operation, @@ -57,6 +58,7 @@ def __init__( Parameters: - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - block_name (str): An identifier for this block. - filter_column (str): The name of the column in the dataset to apply the filter on. - filter_value (any or list of any): The value(s) to filter by. - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset. 
@@ -66,7 +68,7 @@ def __init__( Returns: None """ - super().__init__(ctx, block_name=self.__class__.__name__) + super().__init__(ctx, block_name) self.value = filter_value if isinstance(filter_value, list) else [filter_value] self.column_name = filter_column self.operation = operation diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 4da8330c..871b2ce8 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -11,11 +11,9 @@ class SamplePopulatorBlock(Block): def __init__( - self, ctx, config_paths, column_name, post_fix="", **batch_kwargs + self, ctx, block_name, config_paths, column_name, post_fix="", **batch_kwargs ) -> None: - super().__init__( - ctx, block_name=self.__class__.__name__ - ) # Call the base class's __init__ + super().__init__(ctx, block_name) self.configs = {} for config in config_paths: if post_fix: @@ -42,8 +40,10 @@ def generate(self, samples) -> Dataset: class SelectorBlock(Block): - def __init__(self, ctx, choice_map, choice_col, output_col, **batch_kwargs) -> None: - super().__init__(ctx, block_name=self.__class__.__name__) + def __init__( + self, ctx, block_name, choice_map, choice_col, output_col, **batch_kwargs + ) -> None: + super().__init__(ctx, block_name) self.choice_map = choice_map self.choice_col = choice_col self.output_col = output_col @@ -70,9 +70,9 @@ def generate(self, samples: Dataset) -> Dataset: class CombineColumnsBlock(Block): def __init__( - self, ctx, columns, output_col, separator="\n\n", **batch_kwargs + self, ctx, block_name, columns, output_col, separator="\n\n", **batch_kwargs ) -> None: - super().__init__(ctx, block_name=self.__class__.__name__) + super().__init__(ctx, block_name) self.columns = columns self.output_col = output_col self.separator = separator diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py index 53531fd0..5e00c80b 100644 --- a/tests/test_filterblock.py +++ b/tests/test_filterblock.py @@ -15,6 +15,7 @@ class 
TestFilterByValueBlock(unittest.TestCase): def setUp(self): self.block = FilterByValueBlock( PipelineContext(None, None, None, None), + block_name="filter_by_age", filter_column="age", filter_value=30, operation=operator.eq, @@ -22,6 +23,7 @@ def setUp(self): ) self.block_with_list = FilterByValueBlock( PipelineContext(None, None, None, None), + block_name="filter_by_ages", filter_column="age", filter_value=[30, 35], operation=operator.eq, From 8cb673b2b06d0734e08b6a018d5a4f3102589f67 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 00:36:35 +0100 Subject: [PATCH 07/28] Move FilterByValue multiprocessing config to PipelineContext In every use of FilterByValue in the default flows, we use batch_kwargs to set num_proc=8. This doesn't appear to be a pipeline author concern, but rather a runtime parameter which should in future be based on the number of available CPUs and (perhaps) user configuration. For now, just move it from batch_kwargs to PipelineContext. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/default_flows.py | 25 ------------------------- src/instructlab/sdg/filterblock.py | 7 ++----- src/instructlab/sdg/pipeline.py | 2 ++ src/instructlab/sdg/utilblocks.py | 21 ++++++--------------- 4 files changed, 10 insertions(+), 45 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 2839e212..f7e0419e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -125,9 +125,6 @@ def get_flow(self) -> list: "filter_column": "judgment", "filter_value": "YES", "operation": operator.eq, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["judgment", "explanation"], }, @@ -150,9 +147,6 @@ def get_flow(self) -> list: "filter_value": 2.0, "operation": operator.eq, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["feedback", "score"], }, @@ -175,9 +169,6 @@ def get_flow(self) -> list: "filter_value": 1.0, 
"operation": operator.eq, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["explanation", "rating", "__index_level_0__"], }, @@ -215,9 +206,6 @@ def get_flow(self) -> list: "filter_value": 1.0, "operation": operator.eq, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["evaluation", "score", "num_samples"], }, @@ -245,9 +233,6 @@ def get_flow(self) -> list: "filter_value": 2.0, "operation": operator.ge, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["evaluation", "score"], }, @@ -299,9 +284,6 @@ def get_flow(self) -> list: "filter_value": 1.0, "operation": operator.eq, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, "drop_columns": ["evaluation", "score", "num_samples"], }, @@ -329,9 +311,6 @@ def get_flow(self) -> list: "filter_value": 2.0, "operation": operator.ge, "convert_dtype": float, - "batch_kwargs": { - "num_procs": 8, - }, }, }, { @@ -340,10 +319,6 @@ def get_flow(self) -> list: "block_name": "combine_question_and_context", "columns": ["context", "question"], "output_col": "question", - "batch_kwargs": { - "num_procs": 8, - "batched": True, - }, }, }, ] diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index afb58b7b..5b820df5 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -51,7 +51,6 @@ def __init__( filter_value, operation, convert_dtype=None, - **batch_kwargs, ) -> None: """ Initializes a new instance of the FilterByValueBlock class. @@ -63,7 +62,6 @@ def __init__( - filter_value (any or list of any): The value(s) to filter by. - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset. - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. 
Defaults to None. - - **batch_kwargs: Additional kwargs for batch processing. Returns: None @@ -73,14 +71,13 @@ def __init__( self.column_name = filter_column self.operation = operation self.convert_dtype = convert_dtype - self.num_procs = batch_kwargs.get("num_procs", 1) def generate(self, samples) -> Dataset: if self.convert_dtype: samples = _map_dtype( - samples, self.column_name, self.convert_dtype, self.num_procs + samples, self.column_name, self.convert_dtype, self.ctx.num_procs ) return _filter_by_values( - samples, self.column_name, self.operation, self.value, self.num_procs + samples, self.column_name, self.operation, self.value, self.ctx.num_procs ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 93464601..a9db1970 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -20,6 +20,8 @@ def __init__( self.model_id = model_id self.num_instructions_to_generate = num_instructions_to_generate self.sdg_base = resources.files(__package__) + # FIXME: base this on the available number of CPUs + self.num_procs = 8 class Pipeline: diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 871b2ce8..b4e39a5b 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -10,9 +10,7 @@ class SamplePopulatorBlock(Block): - def __init__( - self, ctx, block_name, config_paths, column_name, post_fix="", **batch_kwargs - ) -> None: + def __init__(self, ctx, block_name, config_paths, column_name, post_fix="") -> None: super().__init__(ctx, block_name) self.configs = {} for config in config_paths: @@ -23,7 +21,6 @@ def __init__( config_key = config.split("/")[-1].split(".")[0] self.configs[config_key] = self._load_config(config_name) self.column_name = column_name - self.num_procs = batch_kwargs.get("num_procs", 8) # Using a static method to avoid serializing self when using multiprocessing @staticmethod @@ -35,19 +32,16 @@ def populate(sample): def 
generate(self, samples) -> Dataset: return self._map_populate_samples( - samples, self.configs, self.column_name, self.num_procs + samples, self.configs, self.column_name, self.ctx.num_procs ) class SelectorBlock(Block): - def __init__( - self, ctx, block_name, choice_map, choice_col, output_col, **batch_kwargs - ) -> None: + def __init__(self, ctx, block_name, choice_map, choice_col, output_col) -> None: super().__init__(ctx, block_name) self.choice_map = choice_map self.choice_col = choice_col self.output_col = output_col - self.num_procs = batch_kwargs.get("num_procs", 8) # Using a static method to avoid serializing self when using multiprocessing @staticmethod @@ -64,19 +58,16 @@ def generate(self, samples: Dataset) -> Dataset: self.choice_map, self.choice_col, self.output_col, - self.num_procs, + self.ctx.num_procs, ) class CombineColumnsBlock(Block): - def __init__( - self, ctx, block_name, columns, output_col, separator="\n\n", **batch_kwargs - ) -> None: + def __init__(self, ctx, block_name, columns, output_col, separator="\n\n") -> None: super().__init__(ctx, block_name) self.columns = columns self.output_col = output_col self.separator = separator - self.num_procs = batch_kwargs.get("num_procs", 8) # Using a static method to avoid serializing self when using multiprocessing @staticmethod @@ -89,5 +80,5 @@ def combine(sample): def generate(self, samples: Dataset) -> Dataset: return self._map_combine( - samples, self.columns, self.output_col, self.separator, self.num_procs + samples, self.columns, self.output_col, self.separator, self.ctx.num_procs ) From b956643bce940a2943d2c4c674d4115096fcb67b Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 8 Jul 2024 11:19:48 +0100 Subject: [PATCH 08/28] Add `add_num_samples` to LLMBlock config Two pipelines include an LLMBlock which use `{num_samples}` in their instructions to the teacher model. 
There needs to be some way to configure the LLMBlock so that `num_samples` will be included, but as per #82 (commit a01b04e) the value of `num_samples` should be based on the `num_instructions_to_generate` parameter. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/default_flows.py | 8 ++------ src/instructlab/sdg/llmblock.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index f7e0419e..056ac861 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -184,9 +184,7 @@ def get_flow(self) -> list: "block_name": "gen_questions", "config_path": "configs/skills/freeform_questions.yaml", "output_cols": ["question"], - "batch_kwargs": { - "num_samples": self.ctx.num_instructions_to_generate, - }, + "add_num_samples": True, }, "drop_duplicates": ["question"], }, @@ -262,9 +260,7 @@ def get_flow(self) -> list: "block_name": "gen_grounded_questions", "config_path": "configs/skills/grounded_questions.yaml", "output_cols": ["question"], - "batch_kwargs": { - "num_samples": 3, - }, + "add_num_samples": True, }, "drop_duplicates": ["question"], }, diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index eaa58556..4a32a708 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -59,6 +59,7 @@ def __init__( block_name, config_path, output_cols, + add_num_samples=False, parser_kwargs={}, **batch_kwargs, ) -> None: @@ -69,6 +70,7 @@ def __init__( ) self.prompt_template = self.prompt_struct.format(**self.block_config) self.model_prompt = _get_model_prompt(self.ctx.model_family) + self.add_num_samples = add_num_samples self.output_cols = output_cols self.batch_params = batch_kwargs.get("batch_kwargs", {}) self.parser_name = parser_kwargs.get("parser_name", None) @@ -156,11 +158,12 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: :return: The parsed output 
after generation. """ - num_samples = self.batch_params.get("num_samples", None) logger.debug("Generating outputs for {} samples".format(len(samples))) - if (num_samples is not None) and ("num_samples" not in samples.column_names): - samples = samples.add_column("num_samples", [num_samples] * len(samples)) + if self.add_num_samples and ("num_samples" not in samples.column_names): + samples = samples.add_column( + "num_samples", [self.ctx.num_instructions_to_generate] * len(samples) + ) # validate each sample # Log errors and remove invalid samples @@ -211,6 +214,7 @@ def __init__( config_paths, output_cols, selector_column_name, + add_num_samples=False, parser_kwargs={}, **batch_kwargs, ) -> None: @@ -219,6 +223,7 @@ def __init__( block_name, config_paths[0][0], output_cols, + add_num_samples=add_num_samples, parser_kwargs=parser_kwargs, **batch_kwargs, ) From 18f1513897f6d31f0dd059e398305fa8792843bf Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 01:10:14 +0100 Subject: [PATCH 09/28] Fix LLMBlock batch_kwargs constructor param It's hard to spot, but this: def __init__(self, ..., **batch_kwargs): ... self.batch_params = batch_kwargs.get("batch_kwargs", {}) is equivalent to this: def __init__(self, ..., **kwargs): ... self.batch_params = kwargs.get("batch_kwargs", {}) which is equivalent to this: def __init__(self, ..., batch_kwargs={}, **kwargs): ... self.batch_params = batch_kwargs except that trailing **kwargs meant we were silently accepting unknown block_config parameters. 
Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/llmblock.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 4a32a708..fc794158 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -61,7 +61,7 @@ def __init__( output_cols, add_num_samples=False, parser_kwargs={}, - **batch_kwargs, + batch_kwargs={}, ) -> None: super().__init__(ctx, block_name) self.block_config = self._load_config(config_path) @@ -72,7 +72,7 @@ def __init__( self.model_prompt = _get_model_prompt(self.ctx.model_family) self.add_num_samples = add_num_samples self.output_cols = output_cols - self.batch_params = batch_kwargs.get("batch_kwargs", {}) + self.batch_params = batch_kwargs self.parser_name = parser_kwargs.get("parser_name", None) self.parsing_pattern = parser_kwargs.get("parsing_pattern", None) self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None) @@ -216,7 +216,7 @@ def __init__( selector_column_name, add_num_samples=False, parser_kwargs={}, - **batch_kwargs, + batch_kwargs={}, ) -> None: super().__init__( ctx, @@ -225,7 +225,7 @@ def __init__( output_cols, add_num_samples=add_num_samples, parser_kwargs=parser_kwargs, - **batch_kwargs, + batch_kwargs=batch_kwargs, ) self.selector_column_name = selector_column_name self.prompt_template = {} From 82aadd9f6582093ee2516728e47d8119a204e604 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 01:13:32 +0100 Subject: [PATCH 10/28] Remove batch_kwargs This appears to be unused now - no pipeline definitions include it, and it's not used in LLMBlock anywhere. 
Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/llmblock.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index fc794158..83e88621 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -61,7 +61,6 @@ def __init__( output_cols, add_num_samples=False, parser_kwargs={}, - batch_kwargs={}, ) -> None: super().__init__(ctx, block_name) self.block_config = self._load_config(config_path) @@ -72,7 +71,6 @@ def __init__( self.model_prompt = _get_model_prompt(self.ctx.model_family) self.add_num_samples = add_num_samples self.output_cols = output_cols - self.batch_params = batch_kwargs self.parser_name = parser_kwargs.get("parser_name", None) self.parsing_pattern = parser_kwargs.get("parsing_pattern", None) self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None) @@ -216,7 +214,6 @@ def __init__( selector_column_name, add_num_samples=False, parser_kwargs={}, - batch_kwargs={}, ) -> None: super().__init__( ctx, @@ -225,7 +222,6 @@ def __init__( output_cols, add_num_samples=add_num_samples, parser_kwargs=parser_kwargs, - batch_kwargs=batch_kwargs, ) self.selector_column_name = selector_column_name self.prompt_template = {} From 07c1c6d15407a271cfd8861876539e6afdc1f17f Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 5 Jul 2024 13:22:05 +0100 Subject: [PATCH 11/28] Add a YAML based file format for pipelines See instructlab/dev-docs#109 In order to support custom pipelines, add a YAML based file format. However, to make the default pipelines easier to reason about and develop, also convert them to the YAML file format. 
This changes the top-level API from: ``` mmlu_flow = MMLUBenchFlow().get_flow() knowledge_flow = SynthKnowledgeFlow().get_flow() knowledge_pipe = Pipeline(ctx, mmlu_flow + knowledge_flow) ``` to: ``` knowledge_pipe = Pipeline.from_flows( ctx, [pipeline.MMLU_BENCH_FLOW, pipeline.SYNTH_KNOWLEDGE_FLOW] ) ``` Co-authored-by: Aakanksha Duggal Co-authored-by: Kai Xu Co-authored-by: Russell Bryant Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 10 +- scripts/test_grounded_skills.py | 10 +- scripts/test_knowledge.py | 12 +- src/instructlab/sdg/default_flows.py | 320 ------------------ src/instructlab/sdg/filterblock.py | 37 +- src/instructlab/sdg/flows/__init__.py | 0 src/instructlab/sdg/flows/mmlu_bench.yaml | 14 + .../sdg/flows/simple_freeform_skills.yaml | 13 + .../sdg/flows/simple_grounded_skills.yaml | 13 + .../sdg/flows/simple_knowledge.yaml | 13 + .../sdg/flows/synth_freeform_skills.yaml | 52 +++ .../sdg/flows/synth_grounded_skills.yaml | 65 ++++ .../sdg/flows/synth_knowledge.yaml | 75 ++++ src/instructlab/sdg/generate_data.py | 63 ++-- src/instructlab/sdg/llmblock.py | 10 +- src/instructlab/sdg/pipeline.py | 71 +++- tests/test_default_flows.py | 49 +++ tests/test_filterblock.py | 16 +- 18 files changed, 461 insertions(+), 382 deletions(-) delete mode 100644 src/instructlab/sdg/default_flows.py create mode 100644 src/instructlab/sdg/flows/__init__.py create mode 100644 src/instructlab/sdg/flows/mmlu_bench.yaml create mode 100644 src/instructlab/sdg/flows/simple_freeform_skills.yaml create mode 100644 src/instructlab/sdg/flows/simple_grounded_skills.yaml create mode 100644 src/instructlab/sdg/flows/simple_knowledge.yaml create mode 100644 src/instructlab/sdg/flows/synth_freeform_skills.yaml create mode 100644 src/instructlab/sdg/flows/synth_grounded_skills.yaml create mode 100644 src/instructlab/sdg/flows/synth_knowledge.yaml create mode 100644 tests/test_default_flows.py diff --git a/scripts/test_freeform_skills.py 
b/scripts/test_freeform_skills.py index 058fd64f..4c120264 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -4,8 +4,11 @@ # First Party from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline, PipelineContext +from src.instructlab.sdg.pipeline import ( + SYNTH_FREEFORM_SKILLS_FLOW, + Pipeline, + PipelineContext, +) # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" @@ -51,8 +54,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -skills_flow = SynthSkillsFlow(ctx).get_flow() -skills_pipe = Pipeline(skills_flow) +skills_pipe = Pipeline.from_flows(ctx, [SYNTH_FREEFORM_SKILLS_FLOW]) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 6d0bdc1b..63aa2fcd 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -4,8 +4,11 @@ # First Party from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import SynthGroundedSkillsFlow -from src.instructlab.sdg.pipeline import Pipeline, PipelineContext +from src.instructlab.sdg.pipeline import ( + SYNTH_GROUNDED_SKILLS_FLOW, + Pipeline, + PipelineContext, +) # for vLLM endpoints, the api_key remains "EMPTY" openai_api_key = "EMPTY" @@ -99,8 +102,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 10) -skills_flow = SynthGroundedSkillsFlow(ctx).get_flow() -skills_pipe = Pipeline(skills_flow) +skills_pipe = Pipeline.from_flows(ctx, [SYNTH_GROUNDED_SKILLS_FLOW]) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 2b534903..32747dc1 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -7,8 +7,12 @@ # First Party from src.instructlab.sdg import SDG -from src.instructlab.sdg.default_flows import MMLUBenchFlow, SynthKnowledgeFlow -from 
src.instructlab.sdg.pipeline import Pipeline, PipelineContext +from src.instructlab.sdg.pipeline import ( + MMLU_BENCH_FLOW, + SYNTH_KNOWLEDGE_FLOW, + Pipeline, + PipelineContext, +) # Please don't add you vLLM endpoint key here openai_api_key = "EMPTY" @@ -40,9 +44,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -mmlu_flow = MMLUBenchFlow(ctx).get_flow() -knowledge_flow = SynthKnowledgeFlow(ctx).get_flow() -knowledge_pipe = Pipeline(mmlu_flow + knowledge_flow) +knowledge_pipe = Pipeline.from_flows(ctx, [MMLU_BENCH_FLOW, SYNTH_KNOWLEDGE_FLOW]) sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py deleted file mode 100644 index 056ac861..00000000 --- a/src/instructlab/sdg/default_flows.py +++ /dev/null @@ -1,320 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Standard -from abc import ABC, abstractmethod -import operator - -# Local -from .filterblock import FilterByValueBlock -from .llmblock import LLMBlock -from .utilblocks import CombineColumnsBlock - - -class Flow(ABC): - def __init__(self, ctx) -> None: - self.ctx = ctx - - @abstractmethod - def get_flow(self) -> list: - pass - - -class _SimpleFlow(Flow): - def get_flow(self) -> list: - return [ - { - "block_type": LLMBlock, - "block_config": { - "block_name": "", # must be set by subclass - "config_path": "", # must be set by subclass - "output_cols": ["output"], - }, - "gen_kwargs": { - "max_tokens": 2048, - "temperature": 0.7, - "n": self.ctx.num_instructions_to_generate, - }, - "drop_duplicates": ["output"], - } - ] - - -class SimpleKnowledgeFlow(_SimpleFlow): - def get_flow(self) -> list: - flow = super().get_flow() - flow[0]["block_config"]["config_path"] = ( - "configs/knowledge/simple_generate_qa.yaml" - ) - flow[0]["block_config"]["block_name"] = "gen_knowledge" - return flow - - -class SimpleFreeformSkillFlow(_SimpleFlow): - def get_flow(self) -> list: - flow = super().get_flow() - 
flow[0]["block_config"]["config_path"] = ( - "configs/skills/simple_generate_qa_freeform.yaml" - ) - flow[0]["block_config"]["block_name"] = "gen_skill_freeform" - return flow - - -class SimpleGroundedSkillFlow(_SimpleFlow): - def get_flow(self) -> list: - flow = super().get_flow() - flow[0]["block_config"]["config_path"] = ( - "configs/skills/simple_generate_qa_grounded.yaml" - ) - flow[0]["block_config"]["block_name"] = "gen_skill_grounded" - return flow - - -class MMLUBenchFlow(Flow): - def get_flow(self) -> list: - return [ - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_mmlu_knowledge", - "config_path": "configs/knowledge/mcq_generation.yaml", - "output_cols": ["mmlubench_question", "mmlubench_answer"], - }, - "gen_kwargs": { - "temperature": 0, - "max_tokens": 2048, - }, - "drop_duplicates": ["mmlubench_question"], - }, - ] - - -class SynthKnowledgeFlow(Flow): - def get_flow(self) -> list: - return [ - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_knowledge", - "config_path": "configs/knowledge/generate_questions_responses.yaml", - "output_cols": ["question", "response"], - "parser_kwargs": { - "parser_name": "custom", - "parsing_pattern": r"\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)", - "parser_cleanup_tags": ["[END]"], - }, - }, - "gen_kwargs": { - "max_tokens": 2048, - }, - "drop_duplicates": ["question"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "eval_faithfulness_qa_pair", - "config_path": "configs/knowledge/evaluate_faithfulness.yaml", - "output_cols": ["explanation", "judgment"], - }, - "gen_kwargs": { - "max_tokens": 2048, - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_faithfulness", - "filter_column": "judgment", - "filter_value": "YES", - "operation": operator.eq, - }, - "drop_columns": ["judgment", "explanation"], - }, - { - "block_type": LLMBlock, - "block_config": { - 
"block_name": "eval_relevancy_qa_pair", - "config_path": "configs/knowledge/evaluate_relevancy.yaml", - "output_cols": ["feedback", "score"], - }, - "gen_kwargs": { - "max_tokens": 2048, - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_relevancy", - "filter_column": "score", - "filter_value": 2.0, - "operation": operator.eq, - "convert_dtype": float, - }, - "drop_columns": ["feedback", "score"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "eval_verify_question", - "config_path": "configs/knowledge/evaluate_question.yaml", - "output_cols": ["explanation", "rating"], - }, - "gen_kwargs": { - "max_tokens": 2048, - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_verify_question", - "filter_column": "rating", - "filter_value": 1.0, - "operation": operator.eq, - "convert_dtype": float, - }, - "drop_columns": ["explanation", "rating", "__index_level_0__"], - }, - ] - - -class SynthSkillsFlow(Flow): - def get_flow(self) -> list: - return [ - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_questions", - "config_path": "configs/skills/freeform_questions.yaml", - "output_cols": ["question"], - "add_num_samples": True, - }, - "drop_duplicates": ["question"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "eval_questions", - "config_path": "configs/skills/evaluate_freeform_questions.yaml", - "output_cols": ["evaluation", "score"], - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_questions", - "filter_column": "score", - "filter_value": 1.0, - "operation": operator.eq, - "convert_dtype": float, - }, - "drop_columns": ["evaluation", "score", "num_samples"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_responses", - "config_path": "configs/skills/freeform_responses.yaml", - "output_cols": ["response"], - }, - }, - { - "block_type": LLMBlock, - 
"block_config": { - "block_name": "evaluate_qa_pair", - "config_path": "configs/skills/evaluate_freeform_pair.yaml", - "output_cols": ["evaluation", "score"], - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_qa_pair", - "filter_column": "score", - "filter_value": 2.0, - "operation": operator.ge, - "convert_dtype": float, - }, - "drop_columns": ["evaluation", "score"], - }, - ] - - -class SynthGroundedSkillsFlow(Flow): - def get_flow(self) -> list: - return [ - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_contexts", - "config_path": "configs/skills/contexts.yaml", - "output_cols": ["context"], - }, - "gen_kwargs": { - "temperature": 0.7, - "max_tokens": 2048, - "n": self.ctx.num_instructions_to_generate, - }, - "drop_duplicates": ["context"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_grounded_questions", - "config_path": "configs/skills/grounded_questions.yaml", - "output_cols": ["question"], - "add_num_samples": True, - }, - "drop_duplicates": ["question"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "eval_grounded_questions", - "config_path": "configs/skills/evaluate_grounded_questions.yaml", - "output_cols": ["evaluation", "score"], - }, - }, - { - "block_type": FilterByValueBlock, - "block_config": { - "block_name": "filter_grounded_questions", - "filter_column": "score", - "filter_value": 1.0, - "operation": operator.eq, - "convert_dtype": float, - }, - "drop_columns": ["evaluation", "score", "num_samples"], - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "gen_grounded_responses", - "config_path": "configs/skills/grounded_responses.yaml", - "output_cols": ["response"], - }, - }, - { - "block_type": LLMBlock, - "block_config": { - "block_name": "evaluate_grounded_qa_pair", - "config_path": "configs/skills/evaluate_grounded_pair.yaml", - "output_cols": ["evaluation", "score"], - }, - }, - { - "block_type": 
FilterByValueBlock, - "block_config": { - "block_name": "filter_grounded_qa_pair", - "filter_column": "score", - "filter_value": 2.0, - "operation": operator.ge, - "convert_dtype": float, - }, - }, - { - "block_type": CombineColumnsBlock, - "block_config": { - "block_name": "combine_question_and_context", - "columns": ["context", "question"], - "output_col": "question", - }, - }, - ] diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 5b820df5..9fcbe5c0 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# Standard +import operator + # Third Party from datasets import Dataset @@ -9,6 +12,34 @@ logger = setup_logger(__name__) +class FilterByValueBlockError(Exception): + """An exception raised by the FilterByValue block.""" + + +def _get_operator_func(op): + if not op in dir(operator): + raise FilterByValueBlockError("Unknown FilterByValueBlock operation '{op}'") + return getattr(operator, op) + + +def _get_convert_dtype(convert_dtype): + if not convert_dtype: + return None + + type_mapping = { + "int": int, + "float": float, + "bool": bool, + } + + if not convert_dtype in type_mapping: + raise FilterByValueBlockError( + "Unknown FilterByValueBlock convert_dtype '{convert_dtype}'" + ) + + return type_mapping[convert_dtype] + + # Note - this is not a method on the class below in order to avoid # serializing the object itself when multi-processing is used. 
# In particular, SSLContext - embedded in the OpenAI client object - @@ -69,8 +100,10 @@ def __init__( super().__init__(ctx, block_name) self.value = filter_value if isinstance(filter_value, list) else [filter_value] self.column_name = filter_column - self.operation = operation - self.convert_dtype = convert_dtype + self.operation = _get_operator_func(operation) + self.convert_dtype = _get_convert_dtype(convert_dtype) + if self.convert_dtype: + self.value = [self.convert_dtype(value) for value in self.value] def generate(self, samples) -> Dataset: if self.convert_dtype: diff --git a/src/instructlab/sdg/flows/__init__.py b/src/instructlab/sdg/flows/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/flows/mmlu_bench.yaml b/src/instructlab/sdg/flows/mmlu_bench.yaml new file mode 100644 index 00000000..0555e0a9 --- /dev/null +++ b/src/instructlab/sdg/flows/mmlu_bench.yaml @@ -0,0 +1,14 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_mmlu_knowledge + config_path: configs/knowledge/mcq_generation.yaml + output_cols: + - mmlubench_question + - mmlubench_answer + gen_kwargs: + temperature: 0 + max_tokens: 2048 + drop_duplicates: + - mmlubench_question diff --git a/src/instructlab/sdg/flows/simple_freeform_skills.yaml b/src/instructlab/sdg/flows/simple_freeform_skills.yaml new file mode 100644 index 00000000..deac2875 --- /dev/null +++ b/src/instructlab/sdg/flows/simple_freeform_skills.yaml @@ -0,0 +1,13 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_skill_freeform + config_path: configs/skills/simple_generate_qa_freeform.yaml + output_cols: + - output + gen_kwargs: + max_tokens: 2048 + temperature: 0.7 + drop_duplicates: + - output diff --git a/src/instructlab/sdg/flows/simple_grounded_skills.yaml b/src/instructlab/sdg/flows/simple_grounded_skills.yaml new file mode 100644 index 00000000..f20c3784 --- /dev/null +++ 
b/src/instructlab/sdg/flows/simple_grounded_skills.yaml @@ -0,0 +1,13 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_skill_grounded + config_path: configs/skills/simple_generate_qa_grounded.yaml + output_cols: + - output + gen_kwargs: + max_tokens: 2048 + temperature: 0.7 + drop_duplicates: + - output diff --git a/src/instructlab/sdg/flows/simple_knowledge.yaml b/src/instructlab/sdg/flows/simple_knowledge.yaml new file mode 100644 index 00000000..3243faf5 --- /dev/null +++ b/src/instructlab/sdg/flows/simple_knowledge.yaml @@ -0,0 +1,13 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_knowledge + config_path: configs/knowledge/simple_generate_qa.yaml + output_cols: + - output + gen_kwargs: + max_tokens: 2048 + temperature: 0.7 + drop_duplicates: + - output diff --git a/src/instructlab/sdg/flows/synth_freeform_skills.yaml b/src/instructlab/sdg/flows/synth_freeform_skills.yaml new file mode 100644 index 00000000..885ccad3 --- /dev/null +++ b/src/instructlab/sdg/flows/synth_freeform_skills.yaml @@ -0,0 +1,52 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_questions + config_path: configs/skills/freeform_questions.yaml + add_num_samples: True + output_cols: + - question + drop_duplicates: + - question + - block_type: LLMBlock + block_config: + block_name: eval_questions + config_path: configs/skills/evaluate_freeform_questions.yaml + output_cols: + - evaluation + - score + - block_type: FilterByValueBlock + block_config: + block_name: filter_questions + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - block_type: LLMBlock + block_config: + block_name: gen_responses + config_path: configs/skills/freeform_responses.yaml + output_cols: + - response + - block_type: LLMBlock + block_config: + block_name: evaluate_qa_pair + config_path: 
configs/skills/evaluate_freeform_pair.yaml + output_cols: + - evaluation + - score + - block_type: FilterByValueBlock + block_config: + block_name: filter_qa_pair + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + drop_columns: + - evaluation + - score diff --git a/src/instructlab/sdg/flows/synth_grounded_skills.yaml b/src/instructlab/sdg/flows/synth_grounded_skills.yaml new file mode 100644 index 00000000..7aa9c0c7 --- /dev/null +++ b/src/instructlab/sdg/flows/synth_grounded_skills.yaml @@ -0,0 +1,65 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_contexts + config_path: configs/skills/contexts.yaml + output_cols: + - context + gen_kwargs: + temperature: 0.7 + max_tokens: 2048 + - block_type: LLMBlock + block_config: + block_name: gen_grounded_questions + config_path: configs/skills/grounded_questions.yaml + add_num_samples: True + output_cols: + - question + drop_duplicates: + - question + - block_type: LLMBlock + block_config: + block_name: eval_grounded_questions + config_path: configs/skills/evaluate_grounded_questions.yaml + output_cols: + - evaluation + - score + - block_type: FilterByValueBlock + block_config: + block_name: filter_grounded_questions + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - block_type: LLMBlock + block_config: + block_name: gen_grounded_responses + config_path: configs/skills/grounded_responses.yaml + output_cols: + - response + - block_type: LLMBlock + block_config: + block_name: evaluate_grounded_qa_pair + config_path: configs/skills/evaluate_grounded_pair.yaml + output_cols: + - evaluation + - score + - block_type: FilterByValueBlock + block_config: + block_name: filter_grounded_qa_pair + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + - block_type: CombineColumnsBlock + block_config: + block_name: combine_question_and_context 
+ columns: + - context + - question + output_col: question diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/flows/synth_knowledge.yaml new file mode 100644 index 00000000..dcb0d9cc --- /dev/null +++ b/src/instructlab/sdg/flows/synth_knowledge.yaml @@ -0,0 +1,75 @@ +version: "1.0" +block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_knowledge + config_path: configs/knowledge/generate_questions_responses.yaml + output_cols: + - question + - response + parser_kwargs: + parser_name: custom + parsing_pattern: '\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)' + parser_cleanup_tags: + - "[END]" + gen_kwargs: + max_tokens: 2048 + drop_duplicates: + - question + - block_type: LLMBlock + block_config: + block_name: eval_faithfulness_qa_pair + config_path: configs/knowledge/evaluate_faithfulness.yaml + output_cols: + - explanation + - judgment + gen_kwargs: + max_tokens: 2048 + - block_type: FilterByValueBlock + block_config: + block_name: filter_faithfulness + filter_column: judgment + filter_value: YES + operation: eq + drop_columns: + - judgment + - explanation + - block_type: LLMBlock + block_config: + block_name: eval_relevancy_qa_pair + config_path: configs/knowledge/evaluate_relevancy.yaml + output_cols: + - feedback + - score + gen_kwargs: + max_tokens: 2048 + - block_type: FilterByValueBlock + block_config: + block_name: filter_relevancy + filter_column: score + filter_value: 2.0 + operation: eq + convert_dtype: float + drop_columns: + - feedback + - score + - block_type: LLMBlock + block_config: + block_name: eval_verify_question + config_path: configs/knowledge/evaluate_question.yaml + output_cols: + - explanation + - rating + gen_kwargs: + max_tokens: 2048 + - block_type: FilterByValueBlock + block_config: + block_name: filter_verify_question + filter_column: rating + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - explanation + - 
rating + - __index_level_0__ diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index abcd6665..dc2754b6 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -17,17 +17,18 @@ # First Party # pylint: disable=ungrouped-imports from instructlab.sdg import SDG, utils -from instructlab.sdg.default_flows import ( - MMLUBenchFlow, - SimpleFreeformSkillFlow, - SimpleGroundedSkillFlow, - SimpleKnowledgeFlow, - SynthGroundedSkillsFlow, - SynthKnowledgeFlow, - SynthSkillsFlow, -) from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL -from instructlab.sdg.pipeline import Pipeline, PipelineContext +from instructlab.sdg.pipeline import ( + MMLU_BENCH_FLOW, + SIMPLE_FREEFORM_SKILLS_FLOW, + SIMPLE_GROUNDED_SKILLS_FLOW, + SIMPLE_KNOWLEDGE_FLOW, + SYNTH_FREEFORM_SKILLS_FLOW, + SYNTH_GROUNDED_SKILLS_FLOW, + SYNTH_KNOWLEDGE_FLOW, + Pipeline, + PipelineContext, +) from instructlab.sdg.utils import models from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, @@ -167,40 +168,28 @@ def _gen_test_data( outfile.write("\n") -def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate): - knowledge_flow_types = [] - freeform_skill_flow_types = [] - grounded_skill_flow_types = [] +def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate): + knowledge_flows = [] + freeform_skill_flows = [] + grounded_skill_flows = [] if pipeline == "full": - knowledge_flow_types.append(MMLUBenchFlow) - knowledge_flow_types.append(SynthKnowledgeFlow) - freeform_skill_flow_types.append(SynthSkillsFlow) - grounded_skill_flow_types.append(SynthGroundedSkillsFlow) + knowledge_flows.append(MMLU_BENCH_FLOW) + knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW) + freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW) + grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW) elif pipeline == "simple": - knowledge_flow_types.append(SimpleKnowledgeFlow) 
- freeform_skill_flow_types.append(SimpleFreeformSkillFlow) - grounded_skill_flow_types.append(SimpleGroundedSkillFlow) + knowledge_flows.append(SIMPLE_KNOWLEDGE_FLOW) + freeform_skill_flows.append(SIMPLE_FREEFORM_SKILLS_FLOW) + grounded_skill_flows.append(SIMPLE_GROUNDED_SKILLS_FLOW) else: raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.") - ctx = PipelineContext( - client, model_family, model_name, num_instructions_to_generate - ) - - def build_pipeline(flow_types): - block_configs = [] - for flow_type in flow_types: - block_configs.extend(flow_type(ctx).get_flow()) - return Pipeline(ctx, block_configs) - - knowledge_pipeline = build_pipeline(knowledge_flow_types) - freeform_skill_pipeline = build_pipeline(freeform_skill_flow_types) - grounded_skill_pipeline = build_pipeline(grounded_skill_flow_types) + ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate) return ( - SDG([knowledge_pipeline]), - SDG([freeform_skill_pipeline]), - SDG([grounded_skill_pipeline]), + SDG([Pipeline.from_flows(ctx, knowledge_flows)]), + SDG([Pipeline.from_flows(ctx, freeform_skill_flows)]), + SDG([Pipeline.from_flows(ctx, grounded_skill_flows)]), ) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 83e88621..40304277 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -126,12 +126,20 @@ def _parse(self, generated_string) -> dict: def _format_prompt(self, sample: Dict) -> str: return self.prompt_template.format(**sample).strip() + def _gen_kwargs(self, **gen_kwargs): + gen_kwargs = {**self.defaults, **gen_kwargs} + if "max_tokens" in gen_kwargs: + gen_kwargs["max_tokens"] = int(gen_kwargs["max_tokens"]) + if "temperature" in gen_kwargs: + gen_kwargs["temperature"] = float(gen_kwargs["temperature"]) + return gen_kwargs + def _generate(self, samples, **gen_kwargs) -> list: prompts = [ self.model_prompt.format(prompt=self._format_prompt(sample)) for sample in 
samples ] - generate_args = {**self.defaults, **gen_kwargs} + generate_args = self._gen_kwargs(**gen_kwargs) if self.server_supports_batched: response = self.ctx.client.completions.create( diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index a9db1970..74f58ee7 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # Standard from importlib import resources +import os.path # Third Party from datasets import Dataset +import yaml # Local +from . import filterblock, llmblock, utilblocks from .logger_config import setup_logger logger = setup_logger(__name__) @@ -35,6 +38,15 @@ def __init__(self, ctx, chained_blocks: list) -> None: # pipeline config is the run configuration that consists of the pipeline steps self.chained_blocks = chained_blocks + @classmethod + def from_flows(cls, ctx, flows): + block_configs = [] + for flow_path in flows: + if not os.path.isabs(flow_path): + flow_path = os.path.join(ctx.sdg_base, flow_path) + block_configs.extend(parse_flow_config_file(flow_path)) + return cls(ctx, block_configs) + def _drop_duplicates(self, dataset, cols): """ Drop duplicates from the dataset based on the columns provided. 
@@ -49,7 +61,7 @@ def generate(self, dataset) -> Dataset: dataset: the input dataset """ for block_prop in self.chained_blocks: - block_type = block_prop["block_type"] + block_type = _lookup_block_type(block_prop["block_type"]) block_config = block_prop["block_config"] drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) @@ -69,3 +81,60 @@ def generate(self, dataset) -> Dataset: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) return dataset + + +_block_types = { + "CombineColumnsBlock": utilblocks.CombineColumnsBlock, + "ConditionalLLMBlock": llmblock.ConditionalLLMBlock, + "FilterByValueBlock": filterblock.FilterByValueBlock, + "LLMBlock": llmblock.LLMBlock, + "SamplePopulatorBlock": utilblocks.SamplePopulatorBlock, + "SelectorBlock": utilblocks.SelectorBlock, +} + + +def _lookup_block_type(block_type): + if not block_type in _block_types: + raise FlowParserError("Unknown block type {block_type}") + return _block_types[block_type] + + +_FLOW_PARSER_MAJOR = 1 +_FLOW_PARSER_MINOR = 0 + + +class FlowParserError(Exception): + """An exception raised while parsing a flow config file.""" + + +def parse_flow_config_file(flow_path): + with open(flow_path, "r", encoding="utf-8") as flow_file: + content = yaml.safe_load(flow_file) + + version = content["version"] + major, minor = map(int, version.split(".")) + + if major > _FLOW_PARSER_MAJOR: + raise FlowParserError( + "The custom flow file format is from a future major version." + ) + if major <= _FLOW_PARSER_MAJOR and minor > _FLOW_PARSER_MINOR: + logger.warning( + "The custom flow file may have new features that will be ignored." 
+ ) + + if not "block_configs" in content: + raise FlowParserError( + "The custom flow file contains no 'block_configs' section" + ) + + return content["block_configs"] + + +MMLU_BENCH_FLOW = "flows/mmlu_bench.yaml" +SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml" +SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml" +SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml" +SYNTH_FREEFORM_SKILLS_FLOW = "flows/synth_freeform_skills.yaml" +SYNTH_GROUNDED_SKILLS_FLOW = "flows/synth_grounded_skills.yaml" +SYNTH_KNOWLEDGE_FLOW = "flows/synth_knowledge.yaml" diff --git a/tests/test_default_flows.py b/tests/test_default_flows.py new file mode 100644 index 00000000..b20394a9 --- /dev/null +++ b/tests/test_default_flows.py @@ -0,0 +1,49 @@ +# Standard +from importlib import resources +from unittest.mock import patch +import unittest + +# Third Party +from datasets import Dataset + +# First Party +from instructlab.sdg.filterblock import FilterByValueBlock +from instructlab.sdg.llmblock import ConditionalLLMBlock, LLMBlock +from instructlab.sdg.pipeline import Pipeline, PipelineContext +from instructlab.sdg.utilblocks import ( + CombineColumnsBlock, + SamplePopulatorBlock, + SelectorBlock, +) + + +def _noop_generate(self, samples, **gen_kwargs): + return samples + + +@patch.object(CombineColumnsBlock, "generate", _noop_generate) +@patch.object(ConditionalLLMBlock, "generate", _noop_generate) +@patch.object(FilterByValueBlock, "generate", _noop_generate) +@patch.object(LLMBlock, "generate", _noop_generate) +@patch.object(SamplePopulatorBlock, "generate", _noop_generate) +@patch.object(SelectorBlock, "generate", _noop_generate) +@patch("instructlab.sdg.llmblock.server_supports_batched", lambda c, m: True) +class TestDefaultFlows(unittest.TestCase): + def setUp(self): + self._yaml_files = [ + file + for file in resources.files("instructlab.sdg.flows").iterdir() + if file.suffix == ".yaml" + ] + + def test_pipeline_from_flows(self): + ctx = 
PipelineContext( + client=None, + model_family="mixtral", + model_id="model", + num_instructions_to_generate=1, + ) + for flow_path in self._yaml_files: + pipeline = Pipeline.from_flows(ctx, [flow_path]) + output = pipeline.generate(Dataset.from_list([])) + self.assertIsNotNone(output) diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py index 5e00c80b..cec4eff5 100644 --- a/tests/test_filterblock.py +++ b/tests/test_filterblock.py @@ -15,19 +15,19 @@ class TestFilterByValueBlock(unittest.TestCase): def setUp(self): self.block = FilterByValueBlock( PipelineContext(None, None, None, None), - block_name="filter_by_age", + "filter_by_age", filter_column="age", - filter_value=30, - operation=operator.eq, - convert_dtype=int, + filter_value="30", + operation="eq", + convert_dtype="int", ) self.block_with_list = FilterByValueBlock( PipelineContext(None, None, None, None), - block_name="filter_by_ages", + "filter_by_age_list", filter_column="age", - filter_value=[30, 35], - operation=operator.eq, - convert_dtype=int, + filter_value=["30", "35"], + operation="eq", + convert_dtype="int", ) self.dataset = Dataset.from_dict( {"age": ["25", "30", "35", "forty", "45"]}, From 003c8e81d56f63137074e6ed331e1ca1b452b3dc Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 11:41:00 +0100 Subject: [PATCH 12/28] Merge mmlu_bench block into synth_knowledge pipeline This is the only case where a pipeline is split across multiple files, and it doesn't serve much value, so let's combine. 
Signed-off-by: Mark McLoughlin --- scripts/test_knowledge.py | 3 +-- src/instructlab/sdg/flows/mmlu_bench.yaml | 14 -------------- src/instructlab/sdg/flows/synth_knowledge.yaml | 12 ++++++++++++ src/instructlab/sdg/generate_data.py | 2 -- src/instructlab/sdg/pipeline.py | 1 - 5 files changed, 13 insertions(+), 19 deletions(-) delete mode 100644 src/instructlab/sdg/flows/mmlu_bench.yaml diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 32747dc1..b5007e32 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -8,7 +8,6 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - MMLU_BENCH_FLOW, SYNTH_KNOWLEDGE_FLOW, Pipeline, PipelineContext, @@ -44,7 +43,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -knowledge_pipe = Pipeline.from_flows(ctx, [MMLU_BENCH_FLOW, SYNTH_KNOWLEDGE_FLOW]) +knowledge_pipe = Pipeline.from_flows(ctx, [SYNTH_KNOWLEDGE_FLOW]) sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) diff --git a/src/instructlab/sdg/flows/mmlu_bench.yaml b/src/instructlab/sdg/flows/mmlu_bench.yaml deleted file mode 100644 index 0555e0a9..00000000 --- a/src/instructlab/sdg/flows/mmlu_bench.yaml +++ /dev/null @@ -1,14 +0,0 @@ -version: "1.0" -block_configs: - - block_type: LLMBlock - block_config: - block_name: gen_mmlu_knowledge - config_path: configs/knowledge/mcq_generation.yaml - output_cols: - - mmlubench_question - - mmlubench_answer - gen_kwargs: - temperature: 0 - max_tokens: 2048 - drop_duplicates: - - mmlubench_question diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/flows/synth_knowledge.yaml index dcb0d9cc..164c01b7 100644 --- a/src/instructlab/sdg/flows/synth_knowledge.yaml +++ b/src/instructlab/sdg/flows/synth_knowledge.yaml @@ -1,5 +1,17 @@ version: "1.0" block_configs: + - block_type: LLMBlock + block_config: + block_name: gen_mmlu_knowledge + config_path: configs/knowledge/mcq_generation.yaml + output_cols: + - 
mmlubench_question + - mmlubench_answer + gen_kwargs: + temperature: 0 + max_tokens: 2048 + drop_duplicates: + - mmlubench_question - block_type: LLMBlock block_config: block_name: gen_knowledge diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index dc2754b6..97e08673 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -19,7 +19,6 @@ from instructlab.sdg import SDG, utils from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL from instructlab.sdg.pipeline import ( - MMLU_BENCH_FLOW, SIMPLE_FREEFORM_SKILLS_FLOW, SIMPLE_GROUNDED_SKILLS_FLOW, SIMPLE_KNOWLEDGE_FLOW, @@ -173,7 +172,6 @@ def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_gene freeform_skill_flows = [] grounded_skill_flows = [] if pipeline == "full": - knowledge_flows.append(MMLU_BENCH_FLOW) knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW) freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW) grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 74f58ee7..55602ec9 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -131,7 +131,6 @@ def parse_flow_config_file(flow_path): return content["block_configs"] -MMLU_BENCH_FLOW = "flows/mmlu_bench.yaml" SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml" SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml" SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml" From ab465526d9c97d8ee85985835ea12f05e8169a6a Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 11:55:40 +0100 Subject: [PATCH 13/28] Rename Pipeline.from_flows() to Pipeline.from_file() Now that we don't have a use case for instantiating a pipeline from multiple files, simplify the interface. Also, Pipeline.from_file() is more direct about what it does -- it loads a pipeline from a yaml configuration file. 
Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 4 +- scripts/test_grounded_skills.py | 4 +- scripts/test_knowledge.py | 4 +- src/instructlab/sdg/generate_data.py | 36 ++++++------- src/instructlab/sdg/pipeline.py | 53 +++++++++---------- ...ws.py => test_default_pipeline_configs.py} | 8 +-- 6 files changed, 53 insertions(+), 56 deletions(-) rename tests/{test_default_flows.py => test_default_pipeline_configs.py} (87%) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 4c120264..dbb7a1bb 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - SYNTH_FREEFORM_SKILLS_FLOW, + SYNTH_FREEFORM_SKILLS_FILE, Pipeline, PipelineContext, ) @@ -54,7 +54,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -skills_pipe = Pipeline.from_flows(ctx, [SYNTH_FREEFORM_SKILLS_FLOW]) +skills_pipe = Pipeline.from_file(ctx, SYNTH_FREEFORM_SKILLS_FILE) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 63aa2fcd..f27b8d42 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - SYNTH_GROUNDED_SKILLS_FLOW, + SYNTH_GROUNDED_SKILLS_FILE, Pipeline, PipelineContext, ) @@ -102,7 +102,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 10) -skills_pipe = Pipeline.from_flows(ctx, [SYNTH_GROUNDED_SKILLS_FLOW]) +skills_pipe = Pipeline.from_file(ctx, SYNTH_GROUNDED_SKILLS_FILE) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index b5007e32..8dc6bead 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -8,7 +8,7 @@ # First Party from src.instructlab.sdg import SDG from 
src.instructlab.sdg.pipeline import ( - SYNTH_KNOWLEDGE_FLOW, + SYNTH_KNOWLEDGE_FILE, Pipeline, PipelineContext, ) @@ -43,7 +43,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -knowledge_pipe = Pipeline.from_flows(ctx, [SYNTH_KNOWLEDGE_FLOW]) +knowledge_pipe = Pipeline.from_file(ctx, SYNTH_KNOWLEDGE_FILE) sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 97e08673..d357ec34 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -19,12 +19,12 @@ from instructlab.sdg import SDG, utils from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL from instructlab.sdg.pipeline import ( - SIMPLE_FREEFORM_SKILLS_FLOW, - SIMPLE_GROUNDED_SKILLS_FLOW, - SIMPLE_KNOWLEDGE_FLOW, - SYNTH_FREEFORM_SKILLS_FLOW, - SYNTH_GROUNDED_SKILLS_FLOW, - SYNTH_KNOWLEDGE_FLOW, + SIMPLE_FREEFORM_SKILLS_FILE, + SIMPLE_GROUNDED_SKILLS_FILE, + SIMPLE_KNOWLEDGE_FILE, + SYNTH_FREEFORM_SKILLS_FILE, + SYNTH_GROUNDED_SKILLS_FILE, + SYNTH_KNOWLEDGE_FILE, Pipeline, PipelineContext, ) @@ -168,26 +168,26 @@ def _gen_test_data( def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate): - knowledge_flows = [] - freeform_skill_flows = [] - grounded_skill_flows = [] + knowledge_yaml = None + freeform_skills_yaml = None + grounded_skills_yaml = None if pipeline == "full": - knowledge_flows.append(SYNTH_KNOWLEDGE_FLOW) - freeform_skill_flows.append(SYNTH_FREEFORM_SKILLS_FLOW) - grounded_skill_flows.append(SYNTH_GROUNDED_SKILLS_FLOW) + knowledge_yaml = SYNTH_KNOWLEDGE_FILE + freeform_skills_yaml = SYNTH_FREEFORM_SKILLS_FILE + grounded_skills_yaml = SYNTH_GROUNDED_SKILLS_FILE elif pipeline == "simple": - knowledge_flows.append(SIMPLE_KNOWLEDGE_FLOW) - freeform_skill_flows.append(SIMPLE_FREEFORM_SKILLS_FLOW) - grounded_skill_flows.append(SIMPLE_GROUNDED_SKILLS_FLOW) + knowledge_yaml = SIMPLE_KNOWLEDGE_FILE 
+ freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE + grounded_skills_yaml = SIMPLE_GROUNDED_SKILLS_FILE else: raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.") ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate) return ( - SDG([Pipeline.from_flows(ctx, knowledge_flows)]), - SDG([Pipeline.from_flows(ctx, freeform_skill_flows)]), - SDG([Pipeline.from_flows(ctx, grounded_skill_flows)]), + SDG([Pipeline.from_file(ctx, knowledge_yaml)]), + SDG([Pipeline.from_file(ctx, freeform_skills_yaml)]), + SDG([Pipeline.from_file(ctx, grounded_skills_yaml)]), ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 55602ec9..cb3482d3 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -39,13 +39,10 @@ def __init__(self, ctx, chained_blocks: list) -> None: self.chained_blocks = chained_blocks @classmethod - def from_flows(cls, ctx, flows): - block_configs = [] - for flow_path in flows: - if not os.path.isabs(flow_path): - flow_path = os.path.join(ctx.sdg_base, flow_path) - block_configs.extend(parse_flow_config_file(flow_path)) - return cls(ctx, block_configs) + def from_file(cls, ctx, pipeline_yaml): + if not os.path.isabs(pipeline_yaml): + pipeline_yaml = os.path.join(ctx.sdg_base, pipeline_yaml) + return cls(ctx, _parse_pipeline_config_file(pipeline_yaml)) def _drop_duplicates(self, dataset, cols): """ @@ -95,45 +92,45 @@ def generate(self, dataset) -> Dataset: def _lookup_block_type(block_type): if not block_type in _block_types: - raise FlowParserError("Unknown block type {block_type}") + raise PipelineConfigParserError("Unknown block type {block_type}") return _block_types[block_type] -_FLOW_PARSER_MAJOR = 1 -_FLOW_PARSER_MINOR = 0 +_PIPELINE_CONFIG_PARSER_MAJOR = 1 +_PIPELINE_CONFIG_PARSER_MINOR = 0 -class FlowParserError(Exception): - """An exception raised while parsing a flow config file.""" +class PipelineConfigParserError(Exception): + """An 
exception raised while parsing a pipeline config file.""" -def parse_flow_config_file(flow_path): - with open(flow_path, "r", encoding="utf-8") as flow_file: - content = yaml.safe_load(flow_file) +def _parse_pipeline_config_file(pipeline_yaml): + with open(pipeline_yaml, "r", encoding="utf-8") as pipeline_file: + content = yaml.safe_load(pipeline_file) version = content["version"] major, minor = map(int, version.split(".")) - if major > _FLOW_PARSER_MAJOR: - raise FlowParserError( - "The custom flow file format is from a future major version." + if major > _PIPELINE_CONFIG_PARSER_MAJOR: + raise PipelineConfigParserError( + "The pipeline config file format is from a future major version." ) - if major <= _FLOW_PARSER_MAJOR and minor > _FLOW_PARSER_MINOR: + if major <= _PIPELINE_CONFIG_PARSER_MAJOR and minor > _PIPELINE_CONFIG_PARSER_MINOR: logger.warning( - "The custom flow file may have new features that will be ignored." + "The pipeline config file may have new features that will be ignored." 
) if not "block_configs" in content: - raise FlowParserError( - "The custom flow file contains no 'block_configs' section" + raise PipelineConfigParserError( + "The pipeline config file contains no 'block_configs' section" ) return content["block_configs"] -SIMPLE_FREEFORM_SKILLS_FLOW = "flows/simple_freeform_skills.yaml" -SIMPLE_GROUNDED_SKILLS_FLOW = "flows/simple_grounded_skills.yaml" -SIMPLE_KNOWLEDGE_FLOW = "flows/simple_knowledge.yaml" -SYNTH_FREEFORM_SKILLS_FLOW = "flows/synth_freeform_skills.yaml" -SYNTH_GROUNDED_SKILLS_FLOW = "flows/synth_grounded_skills.yaml" -SYNTH_KNOWLEDGE_FLOW = "flows/synth_knowledge.yaml" +SIMPLE_FREEFORM_SKILLS_FILE = "flows/simple_freeform_skills.yaml" +SIMPLE_GROUNDED_SKILLS_FILE = "flows/simple_grounded_skills.yaml" +SIMPLE_KNOWLEDGE_FILE = "flows/simple_knowledge.yaml" +SYNTH_FREEFORM_SKILLS_FILE = "flows/synth_freeform_skills.yaml" +SYNTH_GROUNDED_SKILLS_FILE = "flows/synth_grounded_skills.yaml" +SYNTH_KNOWLEDGE_FILE = "flows/synth_knowledge.yaml" diff --git a/tests/test_default_flows.py b/tests/test_default_pipeline_configs.py similarity index 87% rename from tests/test_default_flows.py rename to tests/test_default_pipeline_configs.py index b20394a9..09b8a9f4 100644 --- a/tests/test_default_flows.py +++ b/tests/test_default_pipeline_configs.py @@ -28,7 +28,7 @@ def _noop_generate(self, samples, **gen_kwargs): @patch.object(SamplePopulatorBlock, "generate", _noop_generate) @patch.object(SelectorBlock, "generate", _noop_generate) @patch("instructlab.sdg.llmblock.server_supports_batched", lambda c, m: True) -class TestDefaultFlows(unittest.TestCase): +class TestDefaultPipelineConfigs(unittest.TestCase): def setUp(self): self._yaml_files = [ file @@ -36,14 +36,14 @@ def setUp(self): if file.suffix == ".yaml" ] - def test_pipeline_from_flows(self): + def test_pipeline_from_config(self): ctx = PipelineContext( client=None, model_family="mixtral", model_id="model", num_instructions_to_generate=1, ) - for flow_path in 
self._yaml_files: - pipeline = Pipeline.from_flows(ctx, [flow_path]) + for pipeline_yaml in self._yaml_files: + pipeline = Pipeline.from_file(ctx, pipeline_yaml) output = pipeline.generate(Dataset.from_list([])) self.assertIsNotNone(output) From beabbf3bc96a58de9d094d74e0ee23c4790ae7b9 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 12:12:38 +0100 Subject: [PATCH 14/28] Move pipeline configs into a new directory structure We want an alias like "full" to refer to a directory containing knowledge.yaml, grounded_skills.yaml, and freeform_skills.yaml. Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 4 ++-- scripts/test_grounded_skills.py | 4 ++-- scripts/test_knowledge.py | 8 ++------ src/instructlab/sdg/generate_data.py | 12 ++++++------ src/instructlab/sdg/pipeline.py | 12 ++++++------ src/instructlab/sdg/{flows => pipelines}/__init__.py | 0 src/instructlab/sdg/pipelines/full/__init__.py | 0 .../full/freeform_skills.yaml} | 0 .../full/grounded_skills.yaml} | 0 .../full/knowledge.yaml} | 0 src/instructlab/sdg/pipelines/simple/__init__.py | 0 .../simple/freeform_skills.yaml} | 0 .../simple/grounded_skills.yaml} | 0 .../simple/knowledge.yaml} | 0 tests/test_default_pipeline_configs.py | 6 +++++- 15 files changed, 23 insertions(+), 23 deletions(-) rename src/instructlab/sdg/{flows => pipelines}/__init__.py (100%) create mode 100644 src/instructlab/sdg/pipelines/full/__init__.py rename src/instructlab/sdg/{flows/synth_freeform_skills.yaml => pipelines/full/freeform_skills.yaml} (100%) rename src/instructlab/sdg/{flows/synth_grounded_skills.yaml => pipelines/full/grounded_skills.yaml} (100%) rename src/instructlab/sdg/{flows/synth_knowledge.yaml => pipelines/full/knowledge.yaml} (100%) create mode 100644 src/instructlab/sdg/pipelines/simple/__init__.py rename src/instructlab/sdg/{flows/simple_freeform_skills.yaml => pipelines/simple/freeform_skills.yaml} (100%) rename src/instructlab/sdg/{flows/simple_grounded_skills.yaml => 
pipelines/simple/grounded_skills.yaml} (100%) rename src/instructlab/sdg/{flows/simple_knowledge.yaml => pipelines/simple/knowledge.yaml} (100%) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index dbb7a1bb..45f5f15b 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - SYNTH_FREEFORM_SKILLS_FILE, + FULL_FREEFORM_SKILLS_FILE, Pipeline, PipelineContext, ) @@ -54,7 +54,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -skills_pipe = Pipeline.from_file(ctx, SYNTH_FREEFORM_SKILLS_FILE) +skills_pipe = Pipeline.from_file(ctx, FULL_FREEFORM_SKILLS_FILE) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index f27b8d42..d229f2b5 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -5,7 +5,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - SYNTH_GROUNDED_SKILLS_FILE, + FULL_GROUNDED_SKILLS_FILE, Pipeline, PipelineContext, ) @@ -102,7 +102,7 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 10) -skills_pipe = Pipeline.from_file(ctx, SYNTH_GROUNDED_SKILLS_FILE) +skills_pipe = Pipeline.from_file(ctx, FULL_GROUNDED_SKILLS_FILE) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 8dc6bead..2f207549 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -7,11 +7,7 @@ # First Party from src.instructlab.sdg import SDG -from src.instructlab.sdg.pipeline import ( - SYNTH_KNOWLEDGE_FILE, - Pipeline, - PipelineContext, -) +from src.instructlab.sdg.pipeline import FULL_KNOWLEDGE_FILE, Pipeline, PipelineContext # Please don't add you vLLM endpoint key here openai_api_key = "EMPTY" @@ -43,7 +39,7 @@ ctx = PipelineContext(client, "mixtral", 
teacher_model, 1) -knowledge_pipe = Pipeline.from_file(ctx, SYNTH_KNOWLEDGE_FILE) +knowledge_pipe = Pipeline.from_file(ctx, FULL_KNOWLEDGE_FILE) sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index d357ec34..bdfa63c7 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -19,12 +19,12 @@ from instructlab.sdg import SDG, utils from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL from instructlab.sdg.pipeline import ( + FULL_FREEFORM_SKILLS_FILE, + FULL_GROUNDED_SKILLS_FILE, + FULL_KNOWLEDGE_FILE, SIMPLE_FREEFORM_SKILLS_FILE, SIMPLE_GROUNDED_SKILLS_FILE, SIMPLE_KNOWLEDGE_FILE, - SYNTH_FREEFORM_SKILLS_FILE, - SYNTH_GROUNDED_SKILLS_FILE, - SYNTH_KNOWLEDGE_FILE, Pipeline, PipelineContext, ) @@ -172,9 +172,9 @@ def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_gene freeform_skills_yaml = None grounded_skills_yaml = None if pipeline == "full": - knowledge_yaml = SYNTH_KNOWLEDGE_FILE - freeform_skills_yaml = SYNTH_FREEFORM_SKILLS_FILE - grounded_skills_yaml = SYNTH_GROUNDED_SKILLS_FILE + knowledge_yaml = FULL_KNOWLEDGE_FILE + freeform_skills_yaml = FULL_FREEFORM_SKILLS_FILE + grounded_skills_yaml = FULL_GROUNDED_SKILLS_FILE elif pipeline == "simple": knowledge_yaml = SIMPLE_KNOWLEDGE_FILE freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index cb3482d3..652cb472 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -128,9 +128,9 @@ def _parse_pipeline_config_file(pipeline_yaml): return content["block_configs"] -SIMPLE_FREEFORM_SKILLS_FILE = "flows/simple_freeform_skills.yaml" -SIMPLE_GROUNDED_SKILLS_FILE = "flows/simple_grounded_skills.yaml" -SIMPLE_KNOWLEDGE_FILE = "flows/simple_knowledge.yaml" -SYNTH_FREEFORM_SKILLS_FILE = "flows/synth_freeform_skills.yaml" 
-SYNTH_GROUNDED_SKILLS_FILE = "flows/synth_grounded_skills.yaml" -SYNTH_KNOWLEDGE_FILE = "flows/synth_knowledge.yaml" +SIMPLE_FREEFORM_SKILLS_FILE = "pipelines/simple/freeform_skills.yaml" +SIMPLE_GROUNDED_SKILLS_FILE = "pipelines/simple/grounded_skills.yaml" +SIMPLE_KNOWLEDGE_FILE = "pipelines/simple/knowledge.yaml" +FULL_FREEFORM_SKILLS_FILE = "pipelines/full/freeform_skills.yaml" +FULL_GROUNDED_SKILLS_FILE = "piplines/full/synth_grounded_skills.yaml" +FULL_KNOWLEDGE_FILE = "pipelines/full/synth_knowledge.yaml" diff --git a/src/instructlab/sdg/flows/__init__.py b/src/instructlab/sdg/pipelines/__init__.py similarity index 100% rename from src/instructlab/sdg/flows/__init__.py rename to src/instructlab/sdg/pipelines/__init__.py diff --git a/src/instructlab/sdg/pipelines/full/__init__.py b/src/instructlab/sdg/pipelines/full/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/flows/synth_freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml similarity index 100% rename from src/instructlab/sdg/flows/synth_freeform_skills.yaml rename to src/instructlab/sdg/pipelines/full/freeform_skills.yaml diff --git a/src/instructlab/sdg/flows/synth_grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml similarity index 100% rename from src/instructlab/sdg/flows/synth_grounded_skills.yaml rename to src/instructlab/sdg/pipelines/full/grounded_skills.yaml diff --git a/src/instructlab/sdg/flows/synth_knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml similarity index 100% rename from src/instructlab/sdg/flows/synth_knowledge.yaml rename to src/instructlab/sdg/pipelines/full/knowledge.yaml diff --git a/src/instructlab/sdg/pipelines/simple/__init__.py b/src/instructlab/sdg/pipelines/simple/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/instructlab/sdg/flows/simple_freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml similarity 
index 100% rename from src/instructlab/sdg/flows/simple_freeform_skills.yaml rename to src/instructlab/sdg/pipelines/simple/freeform_skills.yaml diff --git a/src/instructlab/sdg/flows/simple_grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml similarity index 100% rename from src/instructlab/sdg/flows/simple_grounded_skills.yaml rename to src/instructlab/sdg/pipelines/simple/grounded_skills.yaml diff --git a/src/instructlab/sdg/flows/simple_knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml similarity index 100% rename from src/instructlab/sdg/flows/simple_knowledge.yaml rename to src/instructlab/sdg/pipelines/simple/knowledge.yaml diff --git a/tests/test_default_pipeline_configs.py b/tests/test_default_pipeline_configs.py index 09b8a9f4..211cf4de 100644 --- a/tests/test_default_pipeline_configs.py +++ b/tests/test_default_pipeline_configs.py @@ -32,7 +32,11 @@ class TestDefaultPipelineConfigs(unittest.TestCase): def setUp(self): self._yaml_files = [ file - for file in resources.files("instructlab.sdg.flows").iterdir() + for package in [ + "instructlab.sdg.pipelines.simple", + "instructlab.sdg.pipelines.full", + ] + for file in resources.files(package).iterdir() if file.suffix == ".yaml" ] From ec9415929fce129cf37e2603203782244ce83a6e Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 12:21:06 +0100 Subject: [PATCH 15/28] YAML format improvement - move block_name up The primary identifier for a block is its name, so it makes sense at this level rather than under block_config ``` - block_name: gen_questions block_type: LLMBlock block_config: config_path: configs/skills/freeform_questions.yaml ``` rather than ``` - block_type: LLMBlock block_config: block_name: gen_questions config_path: configs/skills/freeform_questions.yaml ``` Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/pipeline.py | 5 +-- .../sdg/pipelines/full/freeform_skills.yaml | 24 +++++++------- 
.../sdg/pipelines/full/grounded_skills.yaml | 32 +++++++++---------- .../sdg/pipelines/full/knowledge.yaml | 32 +++++++++---------- .../sdg/pipelines/simple/freeform_skills.yaml | 4 +-- .../sdg/pipelines/simple/grounded_skills.yaml | 4 +-- .../sdg/pipelines/simple/knowledge.yaml | 4 +-- 7 files changed, 53 insertions(+), 52 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 652cb472..058f2c20 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -58,14 +58,15 @@ def generate(self, dataset) -> Dataset: dataset: the input dataset """ for block_prop in self.chained_blocks: + block_name = block_prop["block_name"] block_type = _lookup_block_type(block_prop["block_type"]) block_config = block_prop["block_config"] drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, **block_config) + block = block_type(self.ctx, block_name, **block_config) - logger.info("Running block: %s", block_config["block_name"]) + logger.info("Running block: %s", block_name) logger.info(dataset) dataset = block.generate(dataset, **gen_kwargs) diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index 885ccad3..53d3667b 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -1,24 +1,24 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_questions + block_type: LLMBlock block_config: - block_name: gen_questions config_path: configs/skills/freeform_questions.yaml add_num_samples: True output_cols: - question drop_duplicates: - question - - block_type: LLMBlock + - block_name: eval_questions + block_type: LLMBlock block_config: - block_name: eval_questions config_path: 
configs/skills/evaluate_freeform_questions.yaml output_cols: - evaluation - score - - block_type: FilterByValueBlock + - block_name: filter_questions + block_type: FilterByValueBlock block_config: - block_name: filter_questions filter_column: score filter_value: 1.0 operation: eq @@ -27,22 +27,22 @@ block_configs: - evaluation - score - num_samples - - block_type: LLMBlock + - block_name: gen_responses + block_type: LLMBlock block_config: - block_name: gen_responses config_path: configs/skills/freeform_responses.yaml output_cols: - response - - block_type: LLMBlock + - block_name: evaluate_qa_pair + block_type: LLMBlock block_config: - block_name: evaluate_qa_pair config_path: configs/skills/evaluate_freeform_pair.yaml output_cols: - evaluation - score - - block_type: FilterByValueBlock + - block_name: filter_qa_pair + block_type: FilterByValueBlock block_config: - block_name: filter_qa_pair filter_column: score filter_value: 2.0 operation: ge diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index 7aa9c0c7..bb94ea1d 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -1,33 +1,33 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_contexts + block_type: LLMBlock block_config: - block_name: gen_contexts config_path: configs/skills/contexts.yaml output_cols: - context gen_kwargs: temperature: 0.7 max_tokens: 2048 - - block_type: LLMBlock + - block_name: gen_grounded_questions + block_type: LLMBlock block_config: - block_name: gen_grounded_questions config_path: configs/skills/grounded_questions.yaml add_num_samples: True output_cols: - question drop_duplicates: - question - - block_type: LLMBlock + - block_name: eval_grounded_questions + block_type: LLMBlock block_config: - block_name: eval_grounded_questions config_path: configs/skills/evaluate_grounded_questions.yaml output_cols: - 
evaluation - score - - block_type: FilterByValueBlock + - block_name: filter_grounded_questions + block_type: FilterByValueBlock block_config: - block_name: filter_grounded_questions filter_column: score filter_value: 1.0 operation: eq @@ -36,29 +36,29 @@ block_configs: - evaluation - score - num_samples - - block_type: LLMBlock + - block_name: gen_grounded_responses + block_type: LLMBlock block_config: - block_name: gen_grounded_responses config_path: configs/skills/grounded_responses.yaml output_cols: - response - - block_type: LLMBlock + - block_name: evaluate_grounded_qa_pair + block_type: LLMBlock block_config: - block_name: evaluate_grounded_qa_pair config_path: configs/skills/evaluate_grounded_pair.yaml output_cols: - evaluation - score - - block_type: FilterByValueBlock + - block_name: filter_grounded_qa_pair + block_type: FilterByValueBlock block_config: - block_name: filter_grounded_qa_pair filter_column: score filter_value: 2.0 operation: ge convert_dtype: float - - block_type: CombineColumnsBlock + - block_name: combine_question_and_context + block_type: CombineColumnsBlock block_config: - block_name: combine_question_and_context columns: - context - question diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index 164c01b7..5a1af3e6 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_mmlu_knowledge + block_type: LLMBlock block_config: - block_name: gen_mmlu_knowledge config_path: configs/knowledge/mcq_generation.yaml output_cols: - mmlubench_question @@ -12,9 +12,9 @@ block_configs: max_tokens: 2048 drop_duplicates: - mmlubench_question - - block_type: LLMBlock + - block_name: gen_knowledge + block_type: LLMBlock block_config: - block_name: gen_knowledge config_path: configs/knowledge/generate_questions_responses.yaml output_cols: - 
question @@ -28,36 +28,36 @@ block_configs: max_tokens: 2048 drop_duplicates: - question - - block_type: LLMBlock + - block_name: eval_faithfulness_qa_pair + block_type: LLMBlock block_config: - block_name: eval_faithfulness_qa_pair config_path: configs/knowledge/evaluate_faithfulness.yaml output_cols: - explanation - judgment gen_kwargs: max_tokens: 2048 - - block_type: FilterByValueBlock + - block_name: filter_faithfulness + block_type: FilterByValueBlock block_config: - block_name: filter_faithfulness filter_column: judgment filter_value: YES operation: eq drop_columns: - judgment - explanation - - block_type: LLMBlock + - block_name: eval_relevancy_qa_pair + block_type: LLMBlock block_config: - block_name: eval_relevancy_qa_pair config_path: configs/knowledge/evaluate_relevancy.yaml output_cols: - feedback - score gen_kwargs: max_tokens: 2048 - - block_type: FilterByValueBlock + - block_name: filter_relevancy + block_type: FilterByValueBlock block_config: - block_name: filter_relevancy filter_column: score filter_value: 2.0 operation: eq @@ -65,18 +65,18 @@ block_configs: drop_columns: - feedback - score - - block_type: LLMBlock + - block_name: eval_verify_question + block_type: LLMBlock block_config: - block_name: eval_verify_question config_path: configs/knowledge/evaluate_question.yaml output_cols: - explanation - rating gen_kwargs: max_tokens: 2048 - - block_type: FilterByValueBlock + - block_name: filter_verify_question + block_type: FilterByValueBlock block_config: - block_name: filter_verify_question filter_column: rating filter_value: 1.0 operation: eq diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml index deac2875..31d141b4 100644 --- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_skill_freeform + 
block_type: LLMBlock block_config: - block_name: gen_skill_freeform config_path: configs/skills/simple_generate_qa_freeform.yaml output_cols: - output diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index f20c3784..5804f5ce 100644 --- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_skill_grounded + block_type: LLMBlock block_config: - block_name: gen_skill_grounded config_path: configs/skills/simple_generate_qa_grounded.yaml output_cols: - output diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml index 3243faf5..8fcf4807 100644 --- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_type: LLMBlock + - block_name: gen_knowledge + block_type: LLMBlock block_config: - block_name: gen_knowledge config_path: configs/knowledge/simple_generate_qa.yaml output_cols: - output From 2d92cf69875d7cb4df4f8c08f39ff8f34f56a542 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 12:26:01 +0100 Subject: [PATCH 16/28] YAML format improvement - remove block_ prefix Under a blocks: list, the block_ prefix adds no value to the name, type, and config fields: ``` blocks: - name: gen_questions type: LLMBlock config: config_path: configs/skills/freeform_questions.yaml ``` rather than ``` blocks: - block_name: gen_questions block_type: LLMBlock block_config: config_path: configs/skills/freeform_questions.yaml ``` Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/pipeline.py | 6 +-- .../sdg/pipelines/full/freeform_skills.yaml | 36 +++++++------- .../sdg/pipelines/full/grounded_skills.yaml | 48 +++++++++---------- 
.../sdg/pipelines/full/knowledge.yaml | 48 +++++++++---------- .../sdg/pipelines/simple/freeform_skills.yaml | 6 +-- .../sdg/pipelines/simple/grounded_skills.yaml | 6 +-- .../sdg/pipelines/simple/knowledge.yaml | 6 +-- 7 files changed, 78 insertions(+), 78 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 058f2c20..274cd3a0 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -58,9 +58,9 @@ def generate(self, dataset) -> Dataset: dataset: the input dataset """ for block_prop in self.chained_blocks: - block_name = block_prop["block_name"] - block_type = _lookup_block_type(block_prop["block_type"]) - block_config = block_prop["block_config"] + block_name = block_prop["name"] + block_type = _lookup_block_type(block_prop["type"]) + block_config = block_prop["config"] drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index 53d3667b..436d4240 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -1,24 +1,24 @@ version: "1.0" block_configs: - - block_name: gen_questions - block_type: LLMBlock - block_config: + - name: gen_questions + type: LLMBlock + config: config_path: configs/skills/freeform_questions.yaml add_num_samples: True output_cols: - question drop_duplicates: - question - - block_name: eval_questions - block_type: LLMBlock - block_config: + - name: eval_questions + type: LLMBlock + config: config_path: configs/skills/evaluate_freeform_questions.yaml output_cols: - evaluation - score - - block_name: filter_questions - block_type: FilterByValueBlock - block_config: + - name: filter_questions + type: FilterByValueBlock + config: filter_column: score filter_value: 1.0 
operation: eq @@ -27,22 +27,22 @@ block_configs: - evaluation - score - num_samples - - block_name: gen_responses - block_type: LLMBlock - block_config: + - name: gen_responses + type: LLMBlock + config: config_path: configs/skills/freeform_responses.yaml output_cols: - response - - block_name: evaluate_qa_pair - block_type: LLMBlock - block_config: + - name: evaluate_qa_pair + type: LLMBlock + config: config_path: configs/skills/evaluate_freeform_pair.yaml output_cols: - evaluation - score - - block_name: filter_qa_pair - block_type: FilterByValueBlock - block_config: + - name: filter_qa_pair + type: FilterByValueBlock + config: filter_column: score filter_value: 2.0 operation: ge diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index bb94ea1d..c8a3d939 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -1,33 +1,33 @@ version: "1.0" block_configs: - - block_name: gen_contexts - block_type: LLMBlock - block_config: + - name: gen_contexts + type: LLMBlock + config: config_path: configs/skills/contexts.yaml output_cols: - context gen_kwargs: temperature: 0.7 max_tokens: 2048 - - block_name: gen_grounded_questions - block_type: LLMBlock - block_config: + - name: gen_grounded_questions + type: LLMBlock + config: config_path: configs/skills/grounded_questions.yaml add_num_samples: True output_cols: - question drop_duplicates: - question - - block_name: eval_grounded_questions - block_type: LLMBlock - block_config: + - name: eval_grounded_questions + type: LLMBlock + config: config_path: configs/skills/evaluate_grounded_questions.yaml output_cols: - evaluation - score - - block_name: filter_grounded_questions - block_type: FilterByValueBlock - block_config: + - name: filter_grounded_questions + type: FilterByValueBlock + config: filter_column: score filter_value: 1.0 operation: eq @@ -36,29 +36,29 @@ 
block_configs: - evaluation - score - num_samples - - block_name: gen_grounded_responses - block_type: LLMBlock - block_config: + - name: gen_grounded_responses + type: LLMBlock + config: config_path: configs/skills/grounded_responses.yaml output_cols: - response - - block_name: evaluate_grounded_qa_pair - block_type: LLMBlock - block_config: + - name: evaluate_grounded_qa_pair + type: LLMBlock + config: config_path: configs/skills/evaluate_grounded_pair.yaml output_cols: - evaluation - score - - block_name: filter_grounded_qa_pair - block_type: FilterByValueBlock - block_config: + - name: filter_grounded_qa_pair + type: FilterByValueBlock + config: filter_column: score filter_value: 2.0 operation: ge convert_dtype: float - - block_name: combine_question_and_context - block_type: CombineColumnsBlock - block_config: + - name: combine_question_and_context + type: CombineColumnsBlock + config: columns: - context - question diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index 5a1af3e6..010cbbe5 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_name: gen_mmlu_knowledge - block_type: LLMBlock - block_config: + - name: gen_mmlu_knowledge + type: LLMBlock + config: config_path: configs/knowledge/mcq_generation.yaml output_cols: - mmlubench_question @@ -12,9 +12,9 @@ block_configs: max_tokens: 2048 drop_duplicates: - mmlubench_question - - block_name: gen_knowledge - block_type: LLMBlock - block_config: + - name: gen_knowledge + type: LLMBlock + config: config_path: configs/knowledge/generate_questions_responses.yaml output_cols: - question @@ -28,36 +28,36 @@ block_configs: max_tokens: 2048 drop_duplicates: - question - - block_name: eval_faithfulness_qa_pair - block_type: LLMBlock - block_config: + - name: eval_faithfulness_qa_pair + type: LLMBlock + config: config_path: 
configs/knowledge/evaluate_faithfulness.yaml output_cols: - explanation - judgment gen_kwargs: max_tokens: 2048 - - block_name: filter_faithfulness - block_type: FilterByValueBlock - block_config: + - name: filter_faithfulness + type: FilterByValueBlock + config: filter_column: judgment filter_value: YES operation: eq drop_columns: - judgment - explanation - - block_name: eval_relevancy_qa_pair - block_type: LLMBlock - block_config: + - name: eval_relevancy_qa_pair + type: LLMBlock + config: config_path: configs/knowledge/evaluate_relevancy.yaml output_cols: - feedback - score gen_kwargs: max_tokens: 2048 - - block_name: filter_relevancy - block_type: FilterByValueBlock - block_config: + - name: filter_relevancy + type: FilterByValueBlock + config: filter_column: score filter_value: 2.0 operation: eq @@ -65,18 +65,18 @@ block_configs: drop_columns: - feedback - score - - block_name: eval_verify_question - block_type: LLMBlock - block_config: + - name: eval_verify_question + type: LLMBlock + config: config_path: configs/knowledge/evaluate_question.yaml output_cols: - explanation - rating gen_kwargs: max_tokens: 2048 - - block_name: filter_verify_question - block_type: FilterByValueBlock - block_config: + - name: filter_verify_question + type: FilterByValueBlock + config: filter_column: rating filter_value: 1.0 operation: eq diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml index 31d141b4..04491b78 100644 --- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_name: gen_skill_freeform - block_type: LLMBlock - block_config: + - name: gen_skill_freeform + type: LLMBlock + config: config_path: configs/skills/simple_generate_qa_freeform.yaml output_cols: - output diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml 
b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index 5804f5ce..868d3130 100644 --- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_name: gen_skill_grounded - block_type: LLMBlock - block_config: + - name: gen_skill_grounded + type: LLMBlock + config: config_path: configs/skills/simple_generate_qa_grounded.yaml output_cols: - output diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml index 8fcf4807..3a1479bc 100644 --- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml @@ -1,8 +1,8 @@ version: "1.0" block_configs: - - block_name: gen_knowledge - block_type: LLMBlock - block_config: + - name: gen_knowledge + type: LLMBlock + config: config_path: configs/knowledge/simple_generate_qa.yaml output_cols: - output From a0c9b806731c83a8aa986778a935d544e6004a7f Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 12:49:36 +0100 Subject: [PATCH 17/28] Make "full" and "simple" aliases to a directory of pipeline configs Signed-off-by: Mark McLoughlin --- scripts/test_freeform_skills.py | 8 ++++++-- scripts/test_grounded_skills.py | 8 ++++++-- scripts/test_knowledge.py | 10 ++++++++-- src/instructlab/sdg/generate_data.py | 30 +++++++++++----------------- src/instructlab/sdg/pipeline.py | 8 ++------ 5 files changed, 34 insertions(+), 30 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 45f5f15b..70dd6cf4 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -1,3 +1,6 @@ +# Standard +from importlib import resources + # Third Party from datasets import Dataset from openai import OpenAI @@ -5,7 +8,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - 
FULL_FREEFORM_SKILLS_FILE, + FULL_PIPELINES_PACKAGE, Pipeline, PipelineContext, ) @@ -54,7 +57,8 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -skills_pipe = Pipeline.from_file(ctx, FULL_FREEFORM_SKILLS_FILE) +with resources.path(FULL_PIPELINES_PACKAGE, "freeform_skills.yaml") as yaml_path: + skills_pipe = Pipeline.from_file(ctx, yaml_path) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index d229f2b5..5578db56 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -1,3 +1,6 @@ +# Standard +from importlib import resources + # Third Party from datasets import Dataset from openai import OpenAI @@ -5,7 +8,7 @@ # First Party from src.instructlab.sdg import SDG from src.instructlab.sdg.pipeline import ( - FULL_GROUNDED_SKILLS_FILE, + FULL_PIPELINES_PACKAGE, Pipeline, PipelineContext, ) @@ -102,7 +105,8 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 10) -skills_pipe = Pipeline.from_file(ctx, FULL_GROUNDED_SKILLS_FILE) +with resources.path(FULL_PIPELINES_PACKAGE, "grounded_skills.yaml") as yaml_path: + skills_pipe = Pipeline.from_file(ctx, yaml_path) sdg = SDG([skills_pipe]) gen_data = sdg.generate(ds) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 2f207549..fc65a275 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -1,4 +1,5 @@ # Standard +from importlib import resources import operator # Third Party @@ -7,7 +8,11 @@ # First Party from src.instructlab.sdg import SDG -from src.instructlab.sdg.pipeline import FULL_KNOWLEDGE_FILE, Pipeline, PipelineContext +from src.instructlab.sdg.pipeline import ( + FULL_PIPELINES_PACKAGE, + Pipeline, + PipelineContext, +) # Please don't add you vLLM endpoint key here openai_api_key = "EMPTY" @@ -39,7 +44,8 @@ ctx = PipelineContext(client, "mixtral", teacher_model, 1) -knowledge_pipe = Pipeline.from_file(ctx, FULL_KNOWLEDGE_FILE) +with 
resources.path(FULL_PIPELINES_PACKAGE, "knowledge.yaml") as yaml_path: + knowledge_pipe = Pipeline.from_file(ctx, yaml_path) sdg = SDG([knowledge_pipe]) mmlubench_data = sdg.generate(ds) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index bdfa63c7..3bfb8be9 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -2,6 +2,7 @@ # Standard from datetime import datetime +from importlib import resources from pathlib import Path from typing import Optional import json @@ -19,12 +20,8 @@ from instructlab.sdg import SDG, utils from instructlab.sdg.llmblock import MODEL_FAMILY_MERLINITE, MODEL_FAMILY_MIXTRAL from instructlab.sdg.pipeline import ( - FULL_FREEFORM_SKILLS_FILE, - FULL_GROUNDED_SKILLS_FILE, - FULL_KNOWLEDGE_FILE, - SIMPLE_FREEFORM_SKILLS_FILE, - SIMPLE_GROUNDED_SKILLS_FILE, - SIMPLE_KNOWLEDGE_FILE, + FULL_PIPELINES_PACKAGE, + SIMPLE_PIPELINES_PACKAGE, Pipeline, PipelineContext, ) @@ -168,26 +165,23 @@ def _gen_test_data( def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate): - knowledge_yaml = None - freeform_skills_yaml = None - grounded_skills_yaml = None if pipeline == "full": - knowledge_yaml = FULL_KNOWLEDGE_FILE - freeform_skills_yaml = FULL_FREEFORM_SKILLS_FILE - grounded_skills_yaml = FULL_GROUNDED_SKILLS_FILE + pipeline_pkg = FULL_PIPELINES_PACKAGE elif pipeline == "simple": - knowledge_yaml = SIMPLE_KNOWLEDGE_FILE - freeform_skills_yaml = SIMPLE_FREEFORM_SKILLS_FILE - grounded_skills_yaml = SIMPLE_GROUNDED_SKILLS_FILE + pipeline_pkg = SIMPLE_PIPELINES_PACKAGE else: raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.") ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate) + def load_pipeline(yaml_basename): + with resources.path(pipeline_pkg, yaml_basename) as yaml_path: + return Pipeline.from_file(ctx, yaml_path) + return ( - SDG([Pipeline.from_file(ctx, knowledge_yaml)]), - 
SDG([Pipeline.from_file(ctx, freeform_skills_yaml)]), - SDG([Pipeline.from_file(ctx, grounded_skills_yaml)]), + SDG([load_pipeline("knowledge.yaml")]), + SDG([load_pipeline("freeform_skills.yaml")]), + SDG([load_pipeline("grounded_skills.yaml")]), ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 274cd3a0..076589f7 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -129,9 +129,5 @@ def _parse_pipeline_config_file(pipeline_yaml): return content["block_configs"] -SIMPLE_FREEFORM_SKILLS_FILE = "pipelines/simple/freeform_skills.yaml" -SIMPLE_GROUNDED_SKILLS_FILE = "pipelines/simple/grounded_skills.yaml" -SIMPLE_KNOWLEDGE_FILE = "pipelines/simple/knowledge.yaml" -FULL_FREEFORM_SKILLS_FILE = "pipelines/full/freeform_skills.yaml" -FULL_GROUNDED_SKILLS_FILE = "piplines/full/synth_grounded_skills.yaml" -FULL_KNOWLEDGE_FILE = "pipelines/full/synth_knowledge.yaml" +SIMPLE_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.simple" +FULL_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.full" From eb2719fe01b18e087d8a4c6db894c402af355436 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 16:46:58 +0100 Subject: [PATCH 18/28] YAML format improvement - change block_configs to blocks This: ``` version: "1.0" blocks: - name: gen_questions type: LLMBlock ... ``` rather than: ``` version: "1.0" block_configs: - name: gen_questions type: LLMBlock ... 
``` Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/pipeline.py | 6 +++--- src/instructlab/sdg/pipelines/full/freeform_skills.yaml | 2 +- src/instructlab/sdg/pipelines/full/grounded_skills.yaml | 2 +- src/instructlab/sdg/pipelines/full/knowledge.yaml | 2 +- src/instructlab/sdg/pipelines/simple/freeform_skills.yaml | 2 +- src/instructlab/sdg/pipelines/simple/grounded_skills.yaml | 2 +- src/instructlab/sdg/pipelines/simple/knowledge.yaml | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 076589f7..e541191b 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -121,12 +121,12 @@ def _parse_pipeline_config_file(pipeline_yaml): "The pipeline config file may have new features that will be ignored." ) - if not "block_configs" in content: + if not "blocks" in content: raise PipelineConfigParserError( - "The pipeline config file contains no 'block_configs' section" + "The pipeline config file contains no 'blocks' section" ) - return content["block_configs"] + return content["blocks"] SIMPLE_PIPELINES_PACKAGE = "instructlab.sdg.pipelines.simple" diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index 436d4240..f606295f 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_questions type: LLMBlock config: diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index c8a3d939..9f7e927f 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_contexts type: LLMBlock config: diff --git 
a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index 010cbbe5..a1ef7ecb 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_mmlu_knowledge type: LLMBlock config: diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml index 04491b78..de3c8f80 100644 --- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_skill_freeform type: LLMBlock config: diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index 868d3130..3c3a0f26 100644 --- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_skill_grounded type: LLMBlock config: diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml index 3a1479bc..bf89c098 100644 --- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml @@ -1,5 +1,5 @@ version: "1.0" -block_configs: +blocks: - name: gen_knowledge type: LLMBlock config: From 46f16c666a2b4612c7ee34b069426126b960a970 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 12 Jul 2024 16:45:30 +0100 Subject: [PATCH 19/28] Add ImportBlock to allow extending existing pipelines This is to enable the common case of a custom pipeline that extends an existing pipeline, commonly either by prepending or appending to the existing pipeline. 
The format looks like e.g.: ``` version: "1.0" blocks: - - name: import_child type: ImportBlock config: path: pipelines/full/knowledge.yaml - ``` Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/importblock.py | 34 ++++++++++ src/instructlab/sdg/pipeline.py | 3 +- tests/test_importblock.py | 103 +++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/instructlab/sdg/importblock.py create mode 100644 tests/test_importblock.py diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py new file mode 100644 index 00000000..129311cb --- /dev/null +++ b/src/instructlab/sdg/importblock.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# Third Party +from datasets import Dataset + +# Local +from . import pipeline +from .block import Block +from .logger_config import setup_logger + +logger = setup_logger(__name__) + + +class ImportBlock(Block): + def __init__( + self, + ctx, + block_name, + path, + ) -> None: + """ + ImportBlock imports a chain of blocks from another pipeline config file. + + Parameters: + - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - block_name (str): An identifier for this block. + - path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file. + """ + super().__init__(ctx, block_name) + self.path = path + self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path) + + def generate(self, samples) -> Dataset: + logger.info("ImportBlock chaining to blocks from {self.path}") + return self.pipeline.generate(samples) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index e541191b..bea672e1 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -8,7 +8,7 @@ import yaml # Local -from . import filterblock, llmblock, utilblocks +from . 
import filterblock, importblock, llmblock, utilblocks from .logger_config import setup_logger logger = setup_logger(__name__) @@ -85,6 +85,7 @@ def generate(self, dataset) -> Dataset: "CombineColumnsBlock": utilblocks.CombineColumnsBlock, "ConditionalLLMBlock": llmblock.ConditionalLLMBlock, "FilterByValueBlock": filterblock.FilterByValueBlock, + "ImportBlock": importblock.ImportBlock, "LLMBlock": llmblock.LLMBlock, "SamplePopulatorBlock": utilblocks.SamplePopulatorBlock, "SelectorBlock": utilblocks.SelectorBlock, diff --git a/tests/test_importblock.py b/tests/test_importblock.py new file mode 100644 index 00000000..1bc977de --- /dev/null +++ b/tests/test_importblock.py @@ -0,0 +1,103 @@ +# Standard +from unittest.mock import MagicMock, patch +import os +import tempfile +import unittest + +# Third Party +from datasets import Dataset, Features, Value + +# First Party +from instructlab.sdg.importblock import ImportBlock +from instructlab.sdg.pipeline import Pipeline + + +class TestImportBlockWithMockPipeline(unittest.TestCase): + @patch("instructlab.sdg.pipeline.Pipeline") + def setUp(self, mock_pipeline): + self.ctx = MagicMock() + self.block_name = "test_block" + self.path = "/path/to/config" + self.mock_pipeline = mock_pipeline + self.import_block = ImportBlock(self.ctx, self.block_name, self.path) + self.dataset = Dataset.from_dict({}) + + def test_initialization(self): + self.assertEqual(self.import_block.block_name, self.block_name) + self.assertEqual(self.import_block.path, self.path) + self.mock_pipeline.from_file.assert_called_once_with(self.ctx, self.path) + + def test_generate(self): + self.mock_pipeline.from_file.return_value.generate.return_value = self.dataset + samples = self.import_block.generate(self.dataset) + self.mock_pipeline.from_file.return_value.generate.assert_called_once_with( + samples + ) + self.assertEqual(samples, self.dataset) + + +_CHILD_YAML = """\ +version: "1.0" +blocks: +- name: greater_than_thirty + type: FilterByValueBlock + 
config: + filter_column: age + filter_value: 30 + operation: gt + convert_dtype: int +""" + + +_PARENT_YAML_FMT = """\ +version: "1.0" +blocks: +- name: forty_or_under + type: FilterByValueBlock + config: + filter_column: age + filter_value: 40 + operation: le + convert_dtype: int +- name: import_child + type: ImportBlock + config: + path: %s +- name: big_bdays + type: FilterByValueBlock + config: + filter_column: age + filter_value: + - 30 + - 40 + operation: eq + convert_dtype: int +""" + + +class TestImportBlockWithFilterByValue(unittest.TestCase): + def setUp(self): + self.ctx = MagicMock() + self.ctx.num_procs = 1 + self.child_yaml = self._write_tmp_yaml(_CHILD_YAML) + self.parent_yaml = self._write_tmp_yaml(_PARENT_YAML_FMT % self.child_yaml) + self.dataset = Dataset.from_dict( + {"age": ["25", "30", "35", "40", "45"]}, + features=Features({"age": Value("string")}), + ) + + def tearDown(self): + os.remove(self.parent_yaml) + os.remove(self.child_yaml) + + def _write_tmp_yaml(self, content): + tmp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".yaml") + tmp_file.write(content) + tmp_file.close() + return tmp_file.name + + def test_generate(self): + pipeline = Pipeline.from_file(self.ctx, self.parent_yaml) + filtered_dataset = pipeline.generate(self.dataset) + self.assertEqual(len(filtered_dataset), 1) + self.assertEqual(filtered_dataset["age"], [40]) From 82adb4a023f6a2d3fc5a1d87d1d2434ecf1fb3de Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 12 Jul 2024 11:54:05 -0400 Subject: [PATCH 20/28] generate_data: Allow pipeline arg to be a path to a directory In addition to `simple`, and `full`, allow a path to a directory that contains the same 3 files we include in the `sdg` library for the built-in pipelines. This will allow use of custom pipelines instead of our built-in ones if desired. 
Co-authored-by: Mark McLoughlin Signed-off-by: Russell Bryant --- src/instructlab/sdg/generate_data.py | 33 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 3bfb8be9..7a926a5a 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -165,18 +165,31 @@ def _gen_test_data( def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate): + pipeline_pkg = None if pipeline == "full": pipeline_pkg = FULL_PIPELINES_PACKAGE elif pipeline == "simple": pipeline_pkg = SIMPLE_PIPELINES_PACKAGE else: - raise utils.GenerateException(f"Error: pipeline ({pipeline}) is not supported.") + # Validate that pipeline is a valid directory and that it contains the required files + if not os.path.exists(pipeline): + raise utils.GenerateException( + f"Error: pipeline directory ({pipeline}) does not exist." + ) + for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]: + if not os.path.exists(os.path.join(pipeline, file)): + raise utils.GenerateException( + f"Error: pipeline directory ({pipeline}) does not contain {file}." 
+ ) ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate) def load_pipeline(yaml_basename): - with resources.path(pipeline_pkg, yaml_basename) as yaml_path: - return Pipeline.from_file(ctx, yaml_path) + if pipeline_pkg: + with resources.path(pipeline_pkg, yaml_basename) as yaml_path: + return Pipeline.from_file(ctx, yaml_path) + else: + return Pipeline.from_file(ctx, os.path.join(pipeline, yaml_basename)) return ( SDG([load_pipeline("knowledge.yaml")]), @@ -212,9 +225,21 @@ def generate_data( tls_client_cert: Optional[str] = None, tls_client_key: Optional[str] = None, tls_client_passwd: Optional[str] = None, - # TODO need to update the CLI to specify which pipeline to use (simple or full at the moment) pipeline: Optional[str] = "simple", ): + """Generate data for training and testing a model. + + This currently serves as the primary interface from the `ilab` CLI to the `sdg` library. + It is somewhat a transitionary measure, as this function existed back when all of the + functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to + use the SDG library constructs directly, and this function will likely be removed. + + Args: + pipeline: This argument may be either an alias defined by the sdg library ("simple", "full"), + or an absolute path to a directory containing the pipeline YAML files. + We expect three files to be present in this directory: "knowledge.yaml", + "freeform_skills.yaml", and "grounded_skills.yaml". + """ generate_start = time.time() if not os.path.exists(output_dir): From 5a0b7a6f0e3d78dfec063b430b977601b8c54741 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 12 Jul 2024 14:50:08 -0400 Subject: [PATCH 21/28] llm: Set `n` by default in gen_kwargs Prior to converting to yaml format, we were setting `n` to the value of `num_instructions_to_generate`. It was dropped from the yaml since it's a runtime configuration value. We need to set it here so it's set like it was before. 
Co-authored-by: Mark McLoughlin Signed-off-by: Russell Bryant --- src/instructlab/sdg/llmblock.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 40304277..e3129684 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -78,6 +78,7 @@ def __init__( "model": self.ctx.model_id, "temperature": 0, "max_tokens": 12000, + "n": self.ctx.num_instructions_to_generate, } # Whether the LLM server supports a list of input prompts From 7c5c1c3def1a1c81f8e1184e6710e3fbcc27a7cd Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 12 Jul 2024 14:54:35 -0400 Subject: [PATCH 22/28] pipelines: Add missing drop_duplicates for context in grounded skills The full grounded skills pipeline begins by generating context. This block had "drop_duplicates: context" in its config, but it was accidentally dropped in the conversion to yaml. Signed-off-by: Russell Bryant --- src/instructlab/sdg/pipelines/full/grounded_skills.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index 9f7e927f..f684af6f 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -9,6 +9,8 @@ blocks: gen_kwargs: temperature: 0.7 max_tokens: 2048 + drop_duplicates: + - context - name: gen_grounded_questions type: LLMBlock config: From 04f7baa926643f312a06af3c919a812bbfe36d7b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 12 Jul 2024 14:36:44 -0400 Subject: [PATCH 23/28] filterblock: Document block behavior in more detail Update the documentation for the parameters to reflect the updated types (strings) after the move to yaml based block configuration. While we're at it, document a list of operations that make sense to use with this block.
Also include some examples for cases that warrant some more detailed examples: - The `contains` operation only works with strings. - All operations can take multiple candidates for the right side of the operation (filter value) and the block will check all of them and treat the result as True if any are true. - filter_column operator filter_value Signed-off-by: Russell Bryant --- src/instructlab/sdg/filterblock.py | 54 ++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index 9fcbe5c0..d43a597f 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -91,11 +91,61 @@ def __init__( - block_name (str): An identifier for this block. - filter_column (str): The name of the column in the dataset to apply the filter on. - filter_value (any or list of any): The value(s) to filter by. - - operation (callable): A function that takes two arguments (column value and filter value) and returns a boolean indicating whether the row should be included in the filtered dataset. - - convert_dtype (callable, optional): A function to convert the data type of the filter column before applying the filter. Defaults to None. + - operation (string): The name of a function provided by the "operator" + Python package that takes two arguments (column value and filter value) + and returns a boolean indicating whether the row should be included in + the filtered dataset. + - convert_dtype (string, optional): the name of a Python type to convert + the column values to. Supported values are "int", "float", and "bool". + Defaults to None. Returns: None + + For supported values of `operation`, see the "operator" package + documentation: https://docs.python.org/3/library/operator.html + + Only a subset of the "operator" package is relevant. It has to + follow the semantics of taking two parameters and returning a boolean. 
+ Some operations that work include: + - eq: equal to + - ne: not equal to + - gt: greater than + - ge: greater than or equal to + - lt: less than + - le: less than or equal to + - contains: filter_column contains filter_value (only for string columns) + + Note that the semantics of all operations are: + - filter_column operation filter_value + + Example: FilterByValueBlock(ctx, "filter_by_age", "age", 30, "eq", "int") + - This block will filter the dataset to only include rows where the + "age" column is equal to 30. + + The `contains` operator is only supported for string columns. This is + useful if you want to ensure that a string column contains a specific + substring. + + Example: FilterByValueBlock(ctx, "filter_by_name", "full_name", "John", "contains") + - This block will filter the dataset to only include rows where the + "full_name" column contains the substring "John". + + `filter_value` does not have to be a single value. It can also be a list of values. + In that case, the operation will be applied to each value in the list. The result is + considered True if the operation is True for any of the values in the list. + + Example: FilterByValueBlock(ctx, "filter_by_age", "age", [30, 35], "eq", "int") + - This block will filter the dataset to only include rows where the + "age" column is equal to 30 or 35. + + Example: FilterByValueBlock(ctx, "filter_by_city", "city", ["boston", "charleston", "dublin", "new york"], "eq") + - This block will filter the dataset to only include rows where the + "city" column is equal to "boston", "charleston", "dublin", or "new york". + + Example: FilterByValueBlock(ctx, "filter_by_name", "full_name", ["John", "Jane"], "contains") + - This block will filter the dataset to only include rows where the + "full_name" column contains the substring "John" or "Jane".
""" super().__init__(ctx, block_name) self.value = filter_value if isinstance(filter_value, list) else [filter_value] From b8768ac8c7fab019eb7a50c2cd69d18b0f4a737c Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 12 Jul 2024 17:36:24 -0400 Subject: [PATCH 24/28] Undo changes to how `n` parameter is handled I made some past changes to how we set `n` that were not correct. The fixes here include: - Re-add the one place where `n` was hard coded to 10. This was intentional and should be kept as-is. - Fix the `n` logic to be: - use what's specified for `n` in config if present - otherwise set it to 1 We never want to specify n>1 when also using a prompt that makes use of `num_samples`, as we effectively end up with `n` * `num_samples` results. This restores intended behavior of the `full` pipeline, but it also breaks applying `--num-instructions` from the CLI to be the `n` value used with the simple pipeline. That needs to be fixed in a follow-up commit. Signed-off-by: Russell Bryant --- src/instructlab/sdg/llmblock.py | 1 - src/instructlab/sdg/pipelines/full/grounded_skills.yaml | 1 + src/instructlab/sdg/pipelines/simple/grounded_skills.yaml | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index e3129684..40304277 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -78,7 +78,6 @@ def __init__( "model": self.ctx.model_id, "temperature": 0, "max_tokens": 12000, - "n": self.ctx.num_instructions_to_generate, } # Whether the LLM server supports a list of input prompts diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index f684af6f..e8e4d2d1 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -9,6 +9,7 @@ blocks: gen_kwargs: temperature: 0.7 max_tokens: 2048 + n: 10 drop_duplicates: - context - 
name: gen_grounded_questions diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index 3c3a0f26..ed5b1839 100644 --- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -9,5 +9,6 @@ blocks: gen_kwargs: max_tokens: 2048 temperature: 0.7 + n: 10 drop_duplicates: - output From 88f5003aeae7083c9b20bd779c6c12441cc9fef2 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sat, 13 Jul 2024 00:09:14 +0100 Subject: [PATCH 25/28] Re-instate batch_kwargs.num_samples The choice of number of samples turns out to be a pipeline author thing, and shouldn't be affected by runtime parameters. Restore the original behavior. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/llmblock.py | 15 +++++++-------- .../sdg/pipelines/full/freeform_skills.yaml | 3 ++- .../sdg/pipelines/full/grounded_skills.yaml | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 40304277..d090e2b4 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -59,8 +59,8 @@ def __init__( block_name, config_path, output_cols, - add_num_samples=False, parser_kwargs={}, + batch_kwargs={}, ) -> None: super().__init__(ctx, block_name) self.block_config = self._load_config(config_path) @@ -69,8 +69,8 @@ def __init__( ) self.prompt_template = self.prompt_struct.format(**self.block_config) self.model_prompt = _get_model_prompt(self.ctx.model_family) - self.add_num_samples = add_num_samples self.output_cols = output_cols + self.batch_params = batch_kwargs self.parser_name = parser_kwargs.get("parser_name", None) self.parsing_pattern = parser_kwargs.get("parsing_pattern", None) self.parser_cleanup_tags = parser_kwargs.get("parser_cleanup_tags", None) @@ -164,12 +164,11 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: :return: The parsed 
output after generation. """ + num_samples = self.batch_params.get("num_samples", None) logger.debug("Generating outputs for {} samples".format(len(samples))) - if self.add_num_samples and ("num_samples" not in samples.column_names): - samples = samples.add_column( - "num_samples", [self.ctx.num_instructions_to_generate] * len(samples) - ) + if (num_samples is not None) and ("num_samples" not in samples.column_names): + samples = samples.add_column("num_samples", [num_samples] * len(samples)) # validate each sample # Log errors and remove invalid samples @@ -220,16 +219,16 @@ def __init__( config_paths, output_cols, selector_column_name, - add_num_samples=False, parser_kwargs={}, + batch_kwargs={}, ) -> None: super().__init__( ctx, block_name, config_paths[0][0], output_cols, - add_num_samples=add_num_samples, parser_kwargs=parser_kwargs, + batch_kwargs=batch_kwargs, ) self.selector_column_name = selector_column_name self.prompt_template = {} diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index f606295f..7d8d68ca 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -4,9 +4,10 @@ blocks: type: LLMBlock config: config_path: configs/skills/freeform_questions.yaml - add_num_samples: True output_cols: - question + batch_kwargs: + num_samples: 30 drop_duplicates: - question - name: eval_questions diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index e8e4d2d1..c051433c 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -16,9 +16,10 @@ blocks: type: LLMBlock config: config_path: configs/skills/grounded_questions.yaml - add_num_samples: True output_cols: - question + batch_kwargs: + num_samples: 3 drop_duplicates: - question - name: eval_grounded_questions 
From 804ee3a4fdca395f6f3712a4dbfd372f0492e133 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sat, 13 Jul 2024 00:41:37 +0100 Subject: [PATCH 26/28] Interpret llmblock.config_path relative to the pipeline config path Given --pipeline=/some/random/dir/for/pipelines it doesn't make sense for config_path to be relative to /some/random/dir/ - the obvious thing you'd expect is it to be relative to /some/random/dir/for/pipelines. This means config that looks like this: ``` - name: gen_questions type: LLMBlock config: config_path: ../../configs/skills/freeform_questions.yaml ``` Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/block.py | 7 +++++-- src/instructlab/sdg/filterblock.py | 4 +++- src/instructlab/sdg/importblock.py | 4 +++- src/instructlab/sdg/llmblock.py | 5 ++++- src/instructlab/sdg/pipeline.py | 11 ++++++----- .../sdg/pipelines/full/freeform_skills.yaml | 8 ++++---- .../sdg/pipelines/full/grounded_skills.yaml | 10 +++++----- .../sdg/pipelines/full/knowledge.yaml | 10 +++++----- .../sdg/pipelines/simple/freeform_skills.yaml | 2 +- .../sdg/pipelines/simple/grounded_skills.yaml | 2 +- .../sdg/pipelines/simple/knowledge.yaml | 2 +- src/instructlab/sdg/utilblocks.py | 18 ++++++++++++------ tests/test_filterblock.py | 11 ++++++++--- tests/test_importblock.py | 3 ++- 14 files changed, 60 insertions(+), 37 deletions(-) diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py index a28136c4..75b0a4e8 100644 --- a/src/instructlab/sdg/block.py +++ b/src/instructlab/sdg/block.py @@ -15,8 +15,9 @@ class Block(ABC): - def __init__(self, ctx, block_name: str) -> None: + def __init__(self, ctx, pipe, block_name: str) -> None: self.ctx = ctx + self.pipe = pipe self.block_name = block_name @staticmethod @@ -50,6 +51,8 @@ def _load_config(self, config_path: str) -> Union[Dict[str, Any], None]: :return: The loaded configuration. 
""" if not os.path.isabs(config_path): - config_path = os.path.join(self.ctx.sdg_base, config_path) + config_path = os.path.join( + os.path.dirname(self.pipe.config_path), config_path + ) with open(config_path, "r", encoding="utf-8") as config_file: return yaml.safe_load(config_file) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index d43a597f..3cc7b427 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -77,6 +77,7 @@ class FilterByValueBlock(Block): def __init__( self, ctx, + pipe, block_name, filter_column, filter_value, @@ -88,6 +89,7 @@ def __init__( Parameters: - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - pipe (Pipeline): The Pipeline containing this block in its chain. - block_name (str): An identifier for this block. - filter_column (str): The name of the column in the dataset to apply the filter on. - filter_value (any or list of any): The value(s) to filter by. @@ -147,7 +149,7 @@ def __init__( - This block will filter the dataset to only include rows where the "full_name" column contains the substring "John" or "Jane". """ - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.value = filter_value if isinstance(filter_value, list) else [filter_value] self.column_name = filter_column self.operation = _get_operator_func(operation) diff --git a/src/instructlab/sdg/importblock.py b/src/instructlab/sdg/importblock.py index 129311cb..5fa479b8 100644 --- a/src/instructlab/sdg/importblock.py +++ b/src/instructlab/sdg/importblock.py @@ -14,6 +14,7 @@ class ImportBlock(Block): def __init__( self, ctx, + pipe, block_name, path, ) -> None: @@ -22,10 +23,11 @@ def __init__( Parameters: - ctx (PipelineContext): A PipelineContext object containing runtime parameters. + - pipe (Pipeline): The Pipeline containing this block in its chain. - block_name (str): An identifier for this block. 
- path (str): A path (absolute, or relative to the instructlab.sdg package) to a pipeline config file. """ - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.path = path self.pipeline = pipeline.Pipeline.from_file(self.ctx, self.path) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index d090e2b4..3f4d32f4 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -56,13 +56,14 @@ class LLMBlock(Block): def __init__( self, ctx, + pipe, block_name, config_path, output_cols, parser_kwargs={}, batch_kwargs={}, ) -> None: - super().__init__(ctx, block_name) + super().__init__(ctx, pipe, block_name) self.block_config = self._load_config(config_path) self.prompt_struct = ( """{system}\n{introduction}\n{principles}\n{examples}\n{generation}""" @@ -215,6 +216,7 @@ class ConditionalLLMBlock(LLMBlock): def __init__( self, ctx, + pipe, block_name, config_paths, output_cols, @@ -224,6 +226,7 @@ def __init__( ) -> None: super().__init__( ctx, + pipe, block_name, config_paths[0][0], output_cols, diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index bea672e1..3ee08306 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -22,27 +22,28 @@ def __init__( self.model_family = model_family self.model_id = model_id self.num_instructions_to_generate = num_instructions_to_generate - self.sdg_base = resources.files(__package__) # FIXME: base this on the available number of CPUs self.num_procs = 8 class Pipeline: - def __init__(self, ctx, chained_blocks: list) -> None: + def __init__(self, ctx, config_path, chained_blocks: list) -> None: """ Initialize the Pipeline class with a configuration dictionary. 
config_dict: the run config py or yaml loaded into a dictionary """ # ctx is a PipelineContext object that supplies context configuration to every block self.ctx = ctx + # config_path is the path of the pipeline config file used to create this pipeline + self.config_path = config_path # pipeline config is the run configuration that consists of the pipeline steps self.chained_blocks = chained_blocks @classmethod def from_file(cls, ctx, pipeline_yaml): if not os.path.isabs(pipeline_yaml): - pipeline_yaml = os.path.join(ctx.sdg_base, pipeline_yaml) - return cls(ctx, _parse_pipeline_config_file(pipeline_yaml)) + pipeline_yaml = os.path.join(resources.files(__package__), pipeline_yaml) + return cls(ctx, pipeline_yaml, _parse_pipeline_config_file(pipeline_yaml)) def _drop_duplicates(self, dataset, cols): """ @@ -64,7 +65,7 @@ def generate(self, dataset) -> Dataset: drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) - block = block_type(self.ctx, block_name, **block_config) + block = block_type(self.ctx, self, block_name, **block_config) logger.info("Running block: %s", block_name) logger.info(dataset) diff --git a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml index 7d8d68ca..e14c059a 100644 --- a/src/instructlab/sdg/pipelines/full/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/freeform_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_questions type: LLMBlock config: - config_path: configs/skills/freeform_questions.yaml + config_path: ../../configs/skills/freeform_questions.yaml output_cols: - question batch_kwargs: @@ -13,7 +13,7 @@ blocks: - name: eval_questions type: LLMBlock config: - config_path: configs/skills/evaluate_freeform_questions.yaml + config_path: ../../configs/skills/evaluate_freeform_questions.yaml output_cols: - evaluation - score @@ -31,13 +31,13 @@ blocks: - name: 
gen_responses type: LLMBlock config: - config_path: configs/skills/freeform_responses.yaml + config_path: ../../configs/skills/freeform_responses.yaml output_cols: - response - name: evaluate_qa_pair type: LLMBlock config: - config_path: configs/skills/evaluate_freeform_pair.yaml + config_path: ../../configs/skills/evaluate_freeform_pair.yaml output_cols: - evaluation - score diff --git a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml index c051433c..8fad3b83 100644 --- a/src/instructlab/sdg/pipelines/full/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/full/grounded_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_contexts type: LLMBlock config: - config_path: configs/skills/contexts.yaml + config_path: ../../configs/skills/contexts.yaml output_cols: - context gen_kwargs: @@ -15,7 +15,7 @@ blocks: - name: gen_grounded_questions type: LLMBlock config: - config_path: configs/skills/grounded_questions.yaml + config_path: ../../configs/skills/grounded_questions.yaml output_cols: - question batch_kwargs: @@ -25,7 +25,7 @@ blocks: - name: eval_grounded_questions type: LLMBlock config: - config_path: configs/skills/evaluate_grounded_questions.yaml + config_path: ../../configs/skills/evaluate_grounded_questions.yaml output_cols: - evaluation - score @@ -43,13 +43,13 @@ blocks: - name: gen_grounded_responses type: LLMBlock config: - config_path: configs/skills/grounded_responses.yaml + config_path: ../../configs/skills/grounded_responses.yaml output_cols: - response - name: evaluate_grounded_qa_pair type: LLMBlock config: - config_path: configs/skills/evaluate_grounded_pair.yaml + config_path: ../../configs/skills/evaluate_grounded_pair.yaml output_cols: - evaluation - score diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index a1ef7ecb..21802921 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ 
b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_mmlu_knowledge type: LLMBlock config: - config_path: configs/knowledge/mcq_generation.yaml + config_path: ../../configs/knowledge/mcq_generation.yaml output_cols: - mmlubench_question - mmlubench_answer @@ -15,7 +15,7 @@ blocks: - name: gen_knowledge type: LLMBlock config: - config_path: configs/knowledge/generate_questions_responses.yaml + config_path: ../../configs/knowledge/generate_questions_responses.yaml output_cols: - question - response @@ -31,7 +31,7 @@ blocks: - name: eval_faithfulness_qa_pair type: LLMBlock config: - config_path: configs/knowledge/evaluate_faithfulness.yaml + config_path: ../../configs/knowledge/evaluate_faithfulness.yaml output_cols: - explanation - judgment @@ -49,7 +49,7 @@ blocks: - name: eval_relevancy_qa_pair type: LLMBlock config: - config_path: configs/knowledge/evaluate_relevancy.yaml + config_path: ../../configs/knowledge/evaluate_relevancy.yaml output_cols: - feedback - score @@ -68,7 +68,7 @@ blocks: - name: eval_verify_question type: LLMBlock config: - config_path: configs/knowledge/evaluate_question.yaml + config_path: ../../configs/knowledge/evaluate_question.yaml output_cols: - explanation - rating diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml index de3c8f80..be589af8 100644 --- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_skill_freeform type: LLMBlock config: - config_path: configs/skills/simple_generate_qa_freeform.yaml + config_path: ../../configs/skills/simple_generate_qa_freeform.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml index ed5b1839..23925034 100644 --- 
a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml +++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_skill_grounded type: LLMBlock config: - config_path: configs/skills/simple_generate_qa_grounded.yaml + config_path: ../../configs/skills/simple_generate_qa_grounded.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml index bf89c098..7e2cdc4f 100644 --- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml @@ -3,7 +3,7 @@ blocks: - name: gen_knowledge type: LLMBlock config: - config_path: configs/knowledge/simple_generate_qa.yaml + config_path: ../../configs/knowledge/simple_generate_qa.yaml output_cols: - output gen_kwargs: diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index b4e39a5b..6c503d28 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -10,8 +10,10 @@ class SamplePopulatorBlock(Block): - def __init__(self, ctx, block_name, config_paths, column_name, post_fix="") -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, config_paths, column_name, post_fix="" + ) -> None: + super().__init__(ctx, pipe, block_name) self.configs = {} for config in config_paths: if post_fix: @@ -37,8 +39,10 @@ def generate(self, samples) -> Dataset: class SelectorBlock(Block): - def __init__(self, ctx, block_name, choice_map, choice_col, output_col) -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, choice_map, choice_col, output_col + ) -> None: + super().__init__(ctx, pipe, block_name) self.choice_map = choice_map self.choice_col = choice_col self.output_col = output_col @@ -63,8 +67,10 @@ def generate(self, samples: Dataset) -> Dataset: class CombineColumnsBlock(Block): - def __init__(self, ctx, block_name, 
columns, output_col, separator="\n\n") -> None: - super().__init__(ctx, block_name) + def __init__( + self, ctx, pipe, block_name, columns, output_col, separator="\n\n" + ) -> None: + super().__init__(ctx, pipe, block_name) self.columns = columns self.output_col = output_col self.separator = separator diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py index cec4eff5..5dcc4d1b 100644 --- a/tests/test_filterblock.py +++ b/tests/test_filterblock.py @@ -1,5 +1,5 @@ # Standard -from unittest.mock import patch +from unittest.mock import MagicMock, patch import operator import unittest @@ -13,8 +13,12 @@ class TestFilterByValueBlock(unittest.TestCase): def setUp(self): + self.ctx = MagicMock() + self.ctx.num_procs = 1 + self.pipe = MagicMock() self.block = FilterByValueBlock( - PipelineContext(None, None, None, None), + self.ctx, + self.pipe, "filter_by_age", filter_column="age", filter_value="30", @@ -22,7 +26,8 @@ def setUp(self): convert_dtype="int", ) self.block_with_list = FilterByValueBlock( - PipelineContext(None, None, None, None), + self.ctx, + self.pipe, "filter_by_age_list", filter_column="age", filter_value=["30", "35"], diff --git a/tests/test_importblock.py b/tests/test_importblock.py index 1bc977de..80baf215 100644 --- a/tests/test_importblock.py +++ b/tests/test_importblock.py @@ -16,10 +16,11 @@ class TestImportBlockWithMockPipeline(unittest.TestCase): @patch("instructlab.sdg.pipeline.Pipeline") def setUp(self, mock_pipeline): self.ctx = MagicMock() + self.pipe = MagicMock() self.block_name = "test_block" self.path = "/path/to/config" self.mock_pipeline = mock_pipeline - self.import_block = ImportBlock(self.ctx, self.block_name, self.path) + self.import_block = ImportBlock(self.ctx, self.pipe, self.block_name, self.path) self.dataset = Dataset.from_dict({}) def test_initialization(self): From d1c5d5bf189a2dd31b82cdd7d57b0c24603d9e96 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sat, 13 Jul 2024 01:30:55 +0100 Subject: [PATCH 27/28] 
Ensure num_proc is passed as a keyword arg to Dataset.map() Fix a couple of calls where it's being passed as a positional arg, and the second positional arg is `with_indices`. Signed-off-by: Mark McLoughlin --- src/instructlab/sdg/utilblocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 6c503d28..02b536f5 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -30,7 +30,7 @@ def _map_populate(samples, configs, column_name, num_proc=1): def populate(sample): return {**sample, **configs[sample[column_name]]} - return samples.map(populate, num_proc) + return samples.map(populate, num_proc=num_proc) def generate(self, samples) -> Dataset: return self._map_populate_samples( @@ -54,7 +54,7 @@ def select_choice(sample) -> dict: sample[output_col] = sample[choice_map[sample[choice_col]]] return sample - return samples.map(select_choice, num_proc) + return samples.map(select_choice, num_proc=num_proc) def generate(self, samples: Dataset) -> Dataset: return self._map_select_choice( From 2c527702bb8ddebfe786a6dd4eb58b815cfb44fd Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Fri, 12 Jul 2024 22:27:04 -0400 Subject: [PATCH 28/28] fix: use string instead of boolean in YAML for "YES" `field: YES` will be parsed to boolean in YAML, resulting in `"field": True` in Python. This makes any use of "field" as a string problematic in the code. This commit fixes this bug by quoting it properly. 
Signed-off-by: Kai Xu --- src/instructlab/sdg/pipelines/full/knowledge.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/pipelines/full/knowledge.yaml b/src/instructlab/sdg/pipelines/full/knowledge.yaml index 21802921..2b9e9c8d 100644 --- a/src/instructlab/sdg/pipelines/full/knowledge.yaml +++ b/src/instructlab/sdg/pipelines/full/knowledge.yaml @@ -41,7 +41,7 @@ blocks: type: FilterByValueBlock config: filter_column: judgment - filter_value: YES + filter_value: "YES" operation: eq drop_columns: - judgment