diff --git a/.github/workflows/python-test-push.yml b/.github/workflows/python-test-push.yml new file mode 100644 index 0000000..9b37101 --- /dev/null +++ b/.github/workflows/python-test-push.yml @@ -0,0 +1,27 @@ +name: Python Automated Testing + +on: [push] + +jobs: + run_tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install the project + run: uv sync --all-extras --dev + + - name: Run tests + run: uv run pytest tests diff --git a/README.md b/README.md index d040917..79ab9cc 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0z > "I like them big, I like them chonkie." > ~ Moto Moto + # Citation If you use Chonkie in your research, please cite it as follows: diff --git a/pyproject.toml b/pyproject.toml index ecbb6a9..d897d7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ dev = [ "ruff>=0.0.265" ] +[tool.pytest.ini_options] +pythonpath = "src" + [tool.setuptools] package-dir = {"" = "src"} packages = ["chonkie", @@ -56,4 +59,4 @@ packages = ["chonkie", "chonkie.embeddings"] [tool.ruff] -select = ["I", "D"] +select = ["F", "I", "D", "DOC"] diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py index e804359..9319075 100644 --- a/src/chonkie/chunker/base.py +++ b/src/chonkie/chunker/base.py @@ -34,22 +34,22 @@ class BaseChunker(ABC): the chunk() method according to their specific chunking strategy. """ - def __init__(self, tokenizer: Union[str, Any, Callable[[str], int]]): + def __init__(self, tokenizer_or_token_counter: Union[str, Any, Callable[[str], int]]): """Initialize the chunker with a tokenizer. Args: tokenizer_or_token_counter (Union[str, Any]): String, tokenizer object, or token counter object """ - if callable(tokenizer): + if callable(tokenizer_or_token_counter): self.tokenizer = None self._tokenizer_backend = "callable" - self.token_counter = tokenizer - if isinstance(tokenizer, str): - self.tokenizer = self._load_tokenizer(tokenizer) + self.token_counter = tokenizer_or_token_counter + elif isinstance(tokenizer_or_token_counter, str): + self.tokenizer = self._load_tokenizer(tokenizer_or_token_counter) self.token_counter = self._get_tokenizer_counter() else: - self.tokenizer = tokenizer + self.tokenizer = tokenizer_or_token_counter self._tokenizer_backend = self._get_tokenizer_backend() self.token_counter = self._get_tokenizer_counter() diff --git a/tests/chunker/test_semantic_chunker.py b/tests/chunker/test_semantic_chunker.py index 09e2422..09bf015 100644 --- a/tests/chunker/test_semantic_chunker.py +++ b/tests/chunker/test_semantic_chunker.py @@ -85,6 +85,10 @@ def test_semantic_chunker_initialization(embedding_model): assert chunker.initial_sentences == 1 +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_semantic_chunker_initialization_openai(openai_embedding_model): """Test that the SemanticChunker can be initialized with required parameters.""" chunker = SemanticChunker( diff --git a/tests/embeddings/test_openai_embeddings.py b/tests/embeddings/test_openai_embeddings.py index 98596ce..7ae2010 100644 --- a/tests/embeddings/test_openai_embeddings.py +++ b/tests/embeddings/test_openai_embeddings.py @@ -26,18 +26,30 @@ def sample_texts(): ] +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_initialization_with_model_name(): embeddings = OpenAIEmbeddings(model="text-embedding-3-small") assert embeddings.model == "text-embedding-3-small" assert embeddings.client is not None +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_embed_single_text(embedding_model, sample_text): embedding = embedding_model.embed(sample_text) assert isinstance(embedding, np.ndarray) assert embedding.shape == (embedding_model.dimension,) +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_embed_batch_texts(embedding_model, sample_texts): embeddings = embedding_model.embed_batch(sample_texts) assert isinstance(embeddings, list) @@ -48,12 +60,20 @@ def test_embed_batch_texts(embedding_model, sample_texts): ) +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_count_tokens_single_text(embedding_model, sample_text): token_count = embedding_model.count_tokens(sample_text) assert isinstance(token_count, int) assert token_count > 0 +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_count_tokens_batch_texts(embedding_model, sample_texts): token_counts = embedding_model.count_tokens_batch(sample_texts) assert isinstance(token_counts, list) @@ -62,6 +82,10 @@ def test_count_tokens_batch_texts(embedding_model, sample_texts): assert all(count > 0 for count in token_counts) +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_similarity(embedding_model, sample_texts): embeddings = embedding_model.embed_batch(sample_texts) similarity_score = embedding_model.similarity(embeddings[0], embeddings[1]) @@ -69,6 +93,10 @@ def test_similarity(embedding_model, sample_texts): assert 0.0 <= similarity_score <= 1.0 +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_dimension_property(embedding_model): assert isinstance(embedding_model.dimension, int) assert embedding_model.dimension > 0 @@ -78,6 +106,10 @@ def test_is_available(): assert OpenAIEmbeddings.is_available() is True +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="Skipping test because OPENAI_API_KEY is not defined", +) def test_repr(embedding_model): repr_str = repr(embedding_model) assert isinstance(repr_str, str)