Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix dataset formatting for pipeline differences #57

Merged
merged 3 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/instructlab/sdg/configs/knowledge/simple_generate_qa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ Here are the requirements:
examples: |
Here are some examples to help you understand the type of questions that are asked for this document:

{question_1}
{response_1}
{icl_query_1}
{icl_response_1}

{question_2}
{response_2}
{icl_query_2}
{icl_response_2}

{question_3}
{response_3}
{icl_query_3}
{icl_response_3}

Here is the document:
{document}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,10 @@ Here are the requirements:
examples: |
The task is {task_description}.

Here are some examples to help you understand the type of questions that are asked for:
Here is an example to help you understand the type of questions that are asked for:

{question_1}
{response_1}

{question_2}
{response_2}

{question_3}
{response_3}
{seed_question}
{seed_response}

generation: |
Provide a single question and answer pair based on the examples.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,17 @@ Here are the requirements:
examples: |
The task is {task_description}.

Here is some context for the example questions:
Here is some context for the example question:

{context}
{seed_context}

Here are some examples to help you understand the type of questions that are asked for:
Here is an example to help you understand the type of questions that are asked for:

{question_1}
{response_1}

{question_2}
{response_2}

{question_3}
{response_3}
{seed_question}
{seed_response}

generation: |
Provide a single question and answer pair based on the examples.
Provide a single question and answer pair based on the example.

start_tags: [""]
end_tags: [""]
14 changes: 2 additions & 12 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
SynthSkillsFlow,
)
from instructlab.sdg.pipeline import Pipeline
from instructlab.sdg.utils import chunking, models
from instructlab.sdg.utils import models
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
read_taxonomy_leaf_nodes,
Expand Down Expand Up @@ -270,7 +270,7 @@ def generate_data(

generated_data = None
for leaf_node in leaf_nodes.values():
samples = leaf_node_to_samples(leaf_node)
samples = leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count)

if not samples:
raise utils.GenerateException("Error: No samples found in leaf node.")
Expand All @@ -290,16 +290,6 @@ def generate_data(
"Error: No SDG pipeline for this leaf node type: %s" % samples[0]
)

# TODO this is broken, just trying to get initial integration to run
# pylint: disable=consider-using-enumerate
if samples[0].get("document"):
for i in range(len(samples)):
samples[i]["document"] = chunking.chunk_document(
documents=samples[i]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)[0]

# TODO -- there is a parameter for how many samples to generate, but we ignore it so far

logger.debug("Samples: %s" % samples)
Expand Down
104 changes: 72 additions & 32 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

# First Party
from instructlab.sdg import utils
from instructlab.sdg.utils import chunking

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -415,42 +416,81 @@ def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules):
return leaf_nodes


def leaf_node_to_samples(leaf_node):
def _knowledge_leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
samples = [{}]

# document is the same for the whole leaf node
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need a bit of clarification here --
Are we expecting just one document at a time? Because in a leaf node, we could have multiple documents as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

chunk_document() handles multiple documents. take a look here:

for docs in documents:
temp = text_splitter.create_documents([docs])
content.extend([item.page_content for item in temp])

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@aakankshaduggal i'm going too hold off merging until you confirm this makes sense to you -- code may not be super clear, but I do think it's handling multiple documents properly

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay thanks for clarifying @russellb!
Makes sense. 💯

chunks = (
chunking.chunk_document(
documents=leaf_node[0]["document"],
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
)
if leaf_node[0].get("document")
else []
)

# domain is the same for the whole leaf node
domain = leaf_node[0].get("domain")

for chunk in chunks:
# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
samples[-1].setdefault("domain", domain)
samples[-1].setdefault("document", chunk)
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
if "icl_query_3" in samples[-1]:
samples.append({})
if "icl_query_1" not in samples[-1]:
samples[-1]["icl_query_1"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_1"] = leaf_node[i]["output"]
elif "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[i]["output"]
else:
samples[-1]["icl_query_3"] = leaf_node[i]["instruction"]
samples[-1]["icl_response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisble by 3
if "icl_query_2" not in samples[-1]:
samples[-1]["icl_query_2"] = leaf_node[0]["instruction"]
samples[-1]["icl_response_2"] = leaf_node[0]["output"]
if "icl_query_3" not in samples[-1]:
samples[-1]["icl_query_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["icl_response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"output"
]

return samples


def _skill_leaf_node_to_samples(leaf_node):
samples = []

# pylint: disable=consider-using-enumerate
for i in range(len(leaf_node)):
samples[-1].setdefault("task_description", leaf_node[i]["task_description"])
for field in ["document", "domain"]:
if leaf_node[i].get(field):
samples[-1].setdefault(field, leaf_node[i][field])
if samples[-1].get("document") and not samples[-1].get("domain"):
raise utils.GenerateException(
"Error: No domain provided for knowledge document in leaf node"
)
samples.append({})
samples[-1]["task_description"] = leaf_node[i]["task_description"]
if leaf_node[i].get("input"):
samples[-1].setdefault("context", leaf_node[i]["input"])
if "question_3" in samples[-1]:
samples.append({})
if "question_1" not in samples[-1]:
samples[-1]["question_1"] = leaf_node[i]["instruction"]
samples[-1]["response_1"] = leaf_node[i]["output"]
elif "question_2" not in samples[-1]:
samples[-1]["question_2"] = leaf_node[i]["instruction"]
samples[-1]["response_2"] = leaf_node[i]["output"]
else:
samples[-1]["question_3"] = leaf_node[i]["instruction"]
samples[-1]["response_3"] = leaf_node[i]["output"]

# wrap back around to the beginning if the number of examples was not
# evenly divisble by 3
if "question_2" not in samples[-1]:
samples[-1]["question_2"] = leaf_node[0]["instruction"]
samples[-1]["response_2"] = leaf_node[0]["output"]
if "question_3" not in samples[-1]:
samples[-1]["question_3"] = leaf_node[1 if len(leaf_node) > 1 else 0][
"instruction"
]
samples[-1]["response_3"] = leaf_node[1 if len(leaf_node) > 1 else 0]["output"]
samples[-1]["seed_context"] = leaf_node[i]["input"]
samples[-1]["seed_question"] = leaf_node[i]["instruction"]
samples[-1]["seed_response"] = leaf_node[i]["output"]

return samples


def leaf_node_to_samples(leaf_node, server_ctx_size, chunk_word_count):
    """Dispatch a taxonomy leaf node to the appropriate sample builder.

    Leaf nodes whose first entry carries a ``document`` take the
    knowledge path (which chunks the document using the given context
    size and word count); all others take the skill path. An empty leaf
    node yields no samples.
    """
    if not leaf_node:
        return []
    is_knowledge = bool(leaf_node[0].get("document"))
    if is_knowledge:
        return _knowledge_leaf_node_to_samples(
            leaf_node, server_ctx_size, chunk_word_count
        )
    return _skill_leaf_node_to_samples(leaf_node)