Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Block Name In Errors #155

Merged
merged 5 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions src/instructlab/sdg/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

# Local
from . import filterblock, importblock, llmblock, utilblocks
from .block import Block
from .logger_config import setup_logger

logger = setup_logger(__name__)
Expand All @@ -32,6 +33,32 @@ def __init__(
self.num_procs = 8


# This is part of the public API.
class BlockGenerationError(Exception):
    """A BlockGenerationError occurs when a block generates an exception during
    generation. It contains information about which block failed and why.
    """

    def __init__(self, block: "Block", exception: Exception):
        """Record the failing block and the exception it raised

        Args:
            block: the block instance whose generation failed
            exception: the exception raised while the block was generating
        """
        # Forward both values to Exception so that ``args`` is populated even
        # when the error is constructed with keyword arguments. Without this,
        # ``args`` is empty in that case, which makes repr() uninformative and
        # breaks pickle/copy round-trips (they rebuild via ``cls(*args)``).
        super().__init__(block, exception)
        # NOTE: ``block`` is intentionally left public (no underscore prefix)
        # so handlers can work with the block itself when handling the
        # exception, not only with ``block_name`` / ``block_type``.
        self.block = block
        self.exception = exception

    def __str__(self) -> str:
        """Format as ``<ErrorClass>(<block type>/<block name>): <message>``"""
        return f"{self.__class__.__name__}({self.block_type}/{self.block_name}): {self.exception_message}"

    @property
    def block_name(self) -> str:
        """The ``block_name`` attribute of the block that failed"""
        return self.block.block_name

    @property
    def block_type(self) -> str:
        """The class name of the block that failed"""
        return self.block.__class__.__name__

    @property
    def exception_message(self) -> str:
        """The string form of the wrapped exception"""
        return str(self.exception)


# This is part of the public API.
class Pipeline:
def __init__(self, ctx, config_path, chained_blocks: list) -> None:
Expand Down Expand Up @@ -67,17 +94,21 @@ def generate(self, dataset) -> Dataset:
dataset: the input dataset
"""
for block_prop in self.chained_blocks:
# Parse and instantiate the block
block_name = block_prop["name"]
block_type = _lookup_block_type(block_prop["type"])
block_config = block_prop["config"]
drop_columns = block_prop.get("drop_columns", [])
drop_duplicates_cols = block_prop.get("drop_duplicates", False)
block = block_type(self.ctx, self, block_name, **block_config)
gabe-l-hart marked this conversation as resolved.
Show resolved Hide resolved

logger.info("Running block: %s", block_name)
logger.info(dataset)

dataset = block.generate(dataset)
# Execute the block and wrap errors with the block name/type
try:
dataset = block.generate(dataset)
except Exception as err:
raise BlockGenerationError(block=block, exception=err) from err

# If at any point we end up with an empty data set, the pipeline has failed
if len(dataset) == 0:
Expand Down
68 changes: 68 additions & 0 deletions tests/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Unit tests for common Pipeline functionality
"""

# Standard
from unittest import mock

# Third Party
from datasets import Dataset
import pytest

# First Party
from instructlab.sdg.block import Block
from instructlab.sdg.pipeline import BlockGenerationError, Pipeline


def test_pipeline_named_errors_match_type():
    """Check that an exception raised from a block's generate method surfaces
    from Pipeline.generate as a BlockGenerationError that references both the
    original exception and the block instance that raised it
    """
    # Stand-ins for block classes: calling one yields its ``return_value``,
    # which plays the role of the instantiated block
    passing_cls = mock.MagicMock()
    passing_cls.return_value.generate.return_value = ["not empty"]

    boom = RuntimeError("Oh no!")
    failing_cls = mock.MagicMock()
    failing_cls.__name__ = "BadBlock"
    failing_cls.return_value.generate = mock.MagicMock(side_effect=boom)

    chained_blocks = [
        {"name": "I work", "type": "working", "config": {}},
        {"name": "I don't", "type": "failure", "config": {}},
    ]
    fake_registry = {"working": passing_cls, "failure": failing_cls}

    # Swap the block type registry for the fakes while the pipeline runs
    with mock.patch("instructlab.sdg.pipeline._block_types", fake_registry):
        pipeline = Pipeline(None, None, chained_blocks)
        with pytest.raises(BlockGenerationError) as raised:
            pipeline.generate(None)

    error = raised.value
    assert error.__cause__ is boom
    assert error.exception is boom
    assert error.block is failing_cls.return_value


def test_block_generation_error_properties():
    """Make sure the BlockGenerationError exposes its properties and string form
    correctly
    """

    # Minimal concrete Block whose generate simply echoes its input
    class TestBlock(Block):
        def generate(self, dataset: Dataset) -> Dataset:
            return dataset

    block_name = "my-block"
    block = TestBlock(None, None, block_name)
    inner_err = TypeError("Not the right type")
    gen_err = BlockGenerationError(block, inner_err)

    # The wrapped objects themselves must be preserved, so identity is the
    # right check for these two
    assert gen_err.block is block
    assert gen_err.exception is inner_err

    # Strings are compared by value: identity of str objects is an
    # implementation detail, not part of the contract being tested
    assert gen_err.block_name == block_name
    assert gen_err.block_type == TestBlock.__name__
    assert gen_err.exception_message == str(inner_err)
    assert (
        str(gen_err)
        == f"{BlockGenerationError.__name__}({TestBlock.__name__}/{block_name}): {inner_err}"
    )