From 811212353016e0307ef1399f6dd50ec60e44f09d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 30 Jun 2024 14:02:12 -0400 Subject: [PATCH 1/3] Re-introduce document chunking for knowledge When generating samples for a knowledge pipeline, we have to chunk the document down to a size that will fit within the model's context size. There was a hack in place that only used a single chunk. The code now iterates over all chunks of the document for creating samples to send through the pipeline. The commit also separates the code for knowledge and skills since the differences between the formats are growing. Closes #52 Signed-off-by: Russell Bryant --- src/instructlab/sdg/generate_data.py | 14 +---- src/instructlab/sdg/utils/taxonomy.py | 75 ++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 20 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 2c812361..66a2987e 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -30,7 +30,7 @@ SynthSkillsFlow, ) from instructlab.sdg.pipeline import Pipeline -from instructlab.sdg.utils import chunking, models +from instructlab.sdg.utils import models from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, @@ -270,7 +270,7 @@ def generate_data( generated_data = None for leaf_node in leaf_nodes.values(): - samples = leaf_node_to_samples(leaf_node) + samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count) if not samples: raise utils.GenerateException("Error: No samples found in leaf node.") @@ -290,16 +290,6 @@ def generate_data( "Error: No SDG pipeline for this leaf node type: %s" % samples[0] ) - # TODO this is broken, just trying to get initial integration to run - # pylint: disable=consider-using-enumerate - if samples[0].get("document"): - for i in range(len(samples)): - samples[i]["document"] = chunking.chunk_document( - documents=samples[i]["document"], - 
server_ctx_size=server_ctx_size, - chunk_word_count=chunk_word_count, - )[0] - # TODO -- there is a parameter for how many samples to generate, but we ignore it so far logger.debug("Samples: %s" % samples) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 9e62baa5..d11dc92e 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -19,6 +19,7 @@ # First Party from instructlab.sdg import utils +from instructlab.sdg.utils import chunking logger = logging.getLogger(__name__) @@ -415,19 +416,67 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules): return leaf_nodes -def leaf_node_to_samples(leaf_node): +def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): + samples = [{}] + + # document is the same for the whole leaf node + chunks = ( + chunking.chunk_document( + documents=leaf_node[0]["document"], + server_ctx_size=server_ctx_size, + chunk_word_count=chunk_word_count, + ) + if leaf_node[0].get("document") + else [] + ) + + # domain is the same for the whole leaf node + domain = leaf_node[0].get("domain") + + for chunk in chunks: + # pylint: disable=consider-using-enumerate + for i in range(len(leaf_node)): + samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) + samples[-1].setdefault("domain", domain) + samples[-1].setdefault("document", chunk) + if samples[-1].get("document") and not samples[-1].get("domain"): + raise utils.GenerateException( + "Error: No domain provided for knowledge document in leaf node" + ) + if "question_3" in samples[-1]: + samples.append({}) + if "question_1" not in samples[-1]: + samples[-1]["question_1"] = leaf_node[i]["instruction"] + samples[-1]["response_1"] = leaf_node[i]["output"] + elif "question_2" not in samples[-1]: + samples[-1]["question_2"] = leaf_node[i]["instruction"] + samples[-1]["response_2"] = leaf_node[i]["output"] + else: + samples[-1]["question_3"] = 
leaf_node[i]["instruction"] + samples[-1]["response_3"] = leaf_node[i]["output"] + + # wrap back around to the beginning if the number of examples was not + # evenly divisble by 3 + if "question_2" not in samples[-1]: + samples[-1]["question_2"] = leaf_node[0]["instruction"] + samples[-1]["response_2"] = leaf_node[0]["output"] + if "question_3" not in samples[-1]: + samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + "instruction" + ] + samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + "output" + ] + + return samples + + +def _skill_leaf_node_to_samples(leaf_node): samples = [{}] # pylint: disable=consider-using-enumerate for i in range(len(leaf_node)): samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) - for field in ["document", "domain"]: - if leaf_node[i].get(field): - samples[-1].setdefault(field, leaf_node[i][field]) - if samples[-1].get("document") and not samples[-1].get("domain"): - raise utils.GenerateException( - "Error: No domain provided for knowledge document in leaf node" - ) if leaf_node[i].get("input"): samples[-1].setdefault("context", leaf_node[i]["input"]) if "question_3" in samples[-1]: @@ -454,3 +503,13 @@ def leaf_node_to_samples(leaf_node): samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] return samples + + +def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): + if not leaf_node: + return [] + if "document" in leaf_node[0]: + return _knowledge_leaf_node_to_samples( + leaf_node, server_ctx_size, chunk_word_count + ) + return _skill_leaf_node_to_samples(leaf_node) From 15ae2b947369716a27e26ee8cd138e79fce4f45a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 30 Jun 2024 14:08:39 -0400 Subject: [PATCH 2/3] Change question/response to icl_query/icl_response PR #50 changed the format used in the full knowledge pipeline. Change the simple pipelines to match. Part of issue #55. 
Signed-off-by: Russell Bryant --- .../configs/knowledge/simple_generate_qa.yaml | 12 ++-- .../skills/simple_generate_qa_freeform.yaml | 12 ++-- .../skills/simple_generate_qa_grounded.yaml | 12 ++-- src/instructlab/sdg/utils/taxonomy.py | 60 +++++++++---------- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml index 9ad6fa77..c63b4209 100644 --- a/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml +++ b/src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml @@ -15,14 +15,14 @@ Here are the requirements: examples: | Here are some examples to help you understand the type of questions that are asked for this document: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} Here is the document: {document} diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 2913d7df..9abc1950 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -17,14 +17,14 @@ examples: | Here are some examples to help you understand the type of questions that are asked for: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} generation: | Provide a single question and answer pair based on the examples. 
diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index fe48c99c..f40d3d11 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -21,14 +21,14 @@ examples: | Here are some examples to help you understand the type of questions that are asked for: - {question_1} - {response_1} + {icl_query_1} + {icl_response_1} - {question_2} - {response_2} + {icl_query_2} + {icl_response_2} - {question_3} - {response_3} + {icl_query_3} + {icl_response_3} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d11dc92e..da9ffa11 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -443,28 +443,28 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count raise utils.GenerateException( "Error: No domain provided for knowledge document in leaf node" ) - if "question_3" in samples[-1]: + if "icl_query_3" in samples[-1]: samples.append({}) - if "question_1" not in samples[-1]: - samples[-1]["question_1"] = leaf_node[i]["instruction"] - samples[-1]["response_1"] = leaf_node[i]["output"] - elif "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[i]["instruction"] - samples[-1]["response_2"] = leaf_node[i]["output"] + if "icl_query_1" not in samples[-1]: + samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_1"] = leaf_node[i]["output"] + elif "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[i]["output"] else: - samples[-1]["question_3"] = leaf_node[i]["instruction"] - samples[-1]["response_3"] = leaf_node[i]["output"] + samples[-1]["icl_query_3"] = 
leaf_node[i]["instruction"] + samples[-1]["icl_response_3"] = leaf_node[i]["output"] # wrap back around to the beginning if the number of examples was not # evenly divisble by 3 - if "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[0]["instruction"] - samples[-1]["response_2"] = leaf_node[0]["output"] - if "question_3" not in samples[-1]: - samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + if "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[0]["output"] + if "icl_query_3" not in samples[-1]: + samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "instruction" ] - samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "output" ] @@ -479,28 +479,28 @@ def _skill_leaf_node_to_samples(leaf_node): samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) if leaf_node[i].get("input"): samples[-1].setdefault("context", leaf_node[i]["input"]) - if "question_3" in samples[-1]: + if "icl_query_3" in samples[-1]: samples.append({}) - if "question_1" not in samples[-1]: - samples[-1]["question_1"] = leaf_node[i]["instruction"] - samples[-1]["response_1"] = leaf_node[i]["output"] - elif "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[i]["instruction"] - samples[-1]["response_2"] = leaf_node[i]["output"] + if "icl_query_1" not in samples[-1]: + samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_1"] = leaf_node[i]["output"] + elif "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[i]["output"] else: - samples[-1]["question_3"] = leaf_node[i]["instruction"] - samples[-1]["response_3"] = leaf_node[i]["output"] + samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] + 
samples[-1]["icl_response_3"] = leaf_node[i]["output"] # wrap back around to the beginning if the number of examples was not # evenly divisble by 3 - if "question_2" not in samples[-1]: - samples[-1]["question_2"] = leaf_node[0]["instruction"] - samples[-1]["response_2"] = leaf_node[0]["output"] - if "question_3" not in samples[-1]: - samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ + if "icl_query_2" not in samples[-1]: + samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] + samples[-1]["icl_response_2"] = leaf_node[0]["output"] + if "icl_query_3" not in samples[-1]: + samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ "instruction" ] - samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] + samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] return samples From e6068112bfd19ae6f2c7f8fd13ba0a366584ea02 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sun, 30 Jun 2024 14:25:25 -0400 Subject: [PATCH 3/3] Create a sample per seed example for skills The full skills pipelines expect a single seed question and response in each sample in the dataset. Change the simple skills pipelines to match and update the code to generate the samples in the expected format. 
Closes #55 (the short term needs at least) Signed-off-by: Russell Bryant --- .../skills/simple_generate_qa_freeform.yaml | 12 ++----- .../skills/simple_generate_qa_grounded.yaml | 18 ++++------ src/instructlab/sdg/utils/taxonomy.py | 33 ++++--------------- 3 files changed, 16 insertions(+), 47 deletions(-) diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml index 9abc1950..d584ac33 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_freeform.yaml @@ -15,16 +15,10 @@ Here are the requirements: examples: | The task is {task_description}. - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | Provide a single question and answer pair based on the examples. diff --git a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml index f40d3d11..2ac41a82 100644 --- a/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml +++ b/src/instructlab/sdg/configs/skills/simple_generate_qa_grounded.yaml @@ -15,23 +15,17 @@ Here are the requirements: examples: | The task is {task_description}. 
- Here is some context for the example questions: + Here is some context for the example question: - {context} + {seed_context} - Here are some examples to help you understand the type of questions that are asked for: + Here is an example to help you understand the type of questions that are asked for: - {icl_query_1} - {icl_response_1} - - {icl_query_2} - {icl_response_2} - - {icl_query_3} - {icl_response_3} + {seed_question} + {seed_response} generation: | - Provide a single question and answer pair based on the examples. + Provide a single question and answer pair based on the example. start_tags: [""] end_tags: [""] diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index da9ffa11..d6f6441b 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -472,35 +472,16 @@ def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count def _skill_leaf_node_to_samples(leaf_node): - samples = [{}] + samples = [] # pylint: disable=consider-using-enumerate for i in range(len(leaf_node)): - samples[-1].setdefault("task_description", leaf_node[i]["task_description"]) + samples.append({}) + samples[-1]["task_description"] = leaf_node[i]["task_description"] if leaf_node[i].get("input"): - samples[-1].setdefault("context", leaf_node[i]["input"]) - if "icl_query_3" in samples[-1]: - samples.append({}) - if "icl_query_1" not in samples[-1]: - samples[-1]["icl_query_1"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_1"] = leaf_node[i]["output"] - elif "icl_query_2" not in samples[-1]: - samples[-1]["icl_query_2"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[i]["output"] - else: - samples[-1]["icl_query_3"] = leaf_node[i]["instruction"] - samples[-1]["icl_response_3"] = leaf_node[i]["output"] - - # wrap back around to the beginning if the number of examples was not - # evenly divisble by 3 - if "icl_query_2" not in samples[-1]: - 
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"] - samples[-1]["icl_response_2"] = leaf_node[0]["output"] - if "icl_query_3" not in samples[-1]: - samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][ - "instruction" - ] - samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"] + samples[-1]["seed_context"] = leaf_node[i]["input"] + samples[-1]["seed_question"] = leaf_node[i]["instruction"] + samples[-1]["seed_response"] = leaf_node[i]["output"] return samples @@ -508,7 +489,7 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count): if not leaf_node: return [] - if "document" in leaf_node[0]: + if leaf_node[0].get("document"): return _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size, chunk_word_count )