chonkie-ai · bhavnicksm · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/.github/workflows/python-test-push.yml b/.github/workflows/python-test-push.yml
@@ -0,0 +1,27 @@
+name: Python Automated Testing
+
+on: [push]  # triggered by push events only
+
+jobs:
+  run_tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4  # check out the repository so pyproject.toml and tests/ are available
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true  # reuse uv's cache between workflow runs
+          cache-dependency-glob: "pyproject.toml"  # cache is keyed on this file, so dependency changes invalidate it
+
+      - name: "Set up Python"
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"  # Python version is read from the project metadata, not pinned here
+
+      - name: Install the project
+        run: uv sync --all-extras --dev  # install the project with every optional extra plus the dev dependencies
+
+      - name: Run tests
+        run: uv run pytest tests  # runs in the uv environment; no OPENAI_API_KEY is set in this workflow
diff --git a/README.md b/README.md
@@ -118,6 +118,7 @@ And of course, special thanks to [Moto Moto](https://www.youtube.com/watch?v=I0z
 > "I like them big, I like them chonkie."
 >                                         ~ Moto Moto
 
+
 # Citation
 
 If you use Chonkie in your research, please cite it as follows:

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,11 +49,14 @@ dev = [
     "ruff>=0.0.265"
 ]
 
+[tool.pytest.ini_options]
+pythonpath = "src"  # add src/ to sys.path so tests import chonkie straight from the source tree
+
 [tool.setuptools]
 package-dir = {"" = "src"}
 packages = ["chonkie", 
             "chonkie.chunker",
             "chonkie.embeddings"]
 
 [tool.ruff]
-select = ["I", "D"]
+select = ["F", "I", "D", "DOC"]  # F=pyflakes, I=isort, D=pydocstyle, DOC=pydoclint; NOTE(review): DOC rules may need ruff preview mode - confirm
diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
@@ -34,22 +34,22 @@ class BaseChunker(ABC):
     the chunk() method according to their specific chunking strategy.
     """
 
-    def __init__(self, tokenizer: Union[str, Any, Callable[[str], int]]):
+    def __init__(self, tokenizer_or_token_counter: Union[str, Any, Callable[[str], int]]):  # NOTE(review): renamed from `tokenizer`; callers passing tokenizer=... by keyword must be updated
         """Initialize the chunker with a tokenizer.
 
         Args:
             tokenizer_or_token_counter (Union[str, Any]): String, tokenizer object, or token counter object
 
         """
-        if callable(tokenizer):
+        if callable(tokenizer_or_token_counter):  # a bare callable is used directly as the token counter; no tokenizer object is kept
             self.tokenizer = None
             self._tokenizer_backend = "callable"
-            self.token_counter = tokenizer
-        if isinstance(tokenizer, str):
-            self.tokenizer = self._load_tokenizer(tokenizer)
+            self.token_counter = tokenizer_or_token_counter
+        elif isinstance(tokenizer_or_token_counter, str):  # elif (was a second `if`): stops a callable from also running the else branch below
+            self.tokenizer = self._load_tokenizer(tokenizer_or_token_counter)  # string input is handed to _load_tokenizer
             self.token_counter = self._get_tokenizer_counter()
         else:
-            self.tokenizer = tokenizer
+            self.tokenizer = tokenizer_or_token_counter  # any other object is stored as the tokenizer as-is
             self._tokenizer_backend = self._get_tokenizer_backend()
             self.token_counter = self._get_tokenizer_counter()
 

diff --git a/tests/chunker/test_semantic_chunker.py b/tests/chunker/test_semantic_chunker.py
@@ -85,6 +85,10 @@ def test_semantic_chunker_initialization(embedding_model):
     assert chunker.initial_sentences == 1
 
 
+@pytest.mark.skipif(
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_semantic_chunker_initialization_openai(openai_embedding_model):
     """Test that the SemanticChunker can be initialized with required parameters."""
     chunker = SemanticChunker(

diff --git a/tests/embeddings/test_openai_embeddings.py b/tests/embeddings/test_openai_embeddings.py
@@ -26,18 +26,30 @@ def sample_texts():
     ]
 
 
+@pytest.mark.skipif(  # guard: run only when OPENAI_API_KEY is present in the environment
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_initialization_with_model_name():
     embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
     assert embeddings.model == "text-embedding-3-small"
     assert embeddings.client is not None
 
 
+@pytest.mark.skipif(  # guard: embed() presumably calls the OpenAI API, so skip when no key is configured
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_embed_single_text(embedding_model, sample_text):
     embedding = embedding_model.embed(sample_text)
     assert isinstance(embedding, np.ndarray)
     assert embedding.shape == (embedding_model.dimension,)
 
 
+@pytest.mark.skipif(
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_embed_batch_texts(embedding_model, sample_texts):
     embeddings = embedding_model.embed_batch(sample_texts)
     assert isinstance(embeddings, list)
@@ -48,12 +60,20 @@ def test_embed_batch_texts(embedding_model, sample_texts):
     )
 
 
+@pytest.mark.skipif(  # guard: skipped when OPENAI_API_KEY is absent (presumably the embedding_model fixture needs it - confirm)
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_count_tokens_single_text(embedding_model, sample_text):
     token_count = embedding_model.count_tokens(sample_text)
     assert isinstance(token_count, int)
     assert token_count > 0
 
 
+@pytest.mark.skipif(
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_count_tokens_batch_texts(embedding_model, sample_texts):
     token_counts = embedding_model.count_tokens_batch(sample_texts)
     assert isinstance(token_counts, list)
@@ -62,13 +82,21 @@ def test_count_tokens_batch_texts(embedding_model, sample_texts):
     assert all(count > 0 for count in token_counts)
 
 
+@pytest.mark.skipif(  # guard: embed_batch() presumably calls the OpenAI API, so skip when no key is configured
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_similarity(embedding_model, sample_texts):
     embeddings = embedding_model.embed_batch(sample_texts)
     similarity_score = embedding_model.similarity(embeddings[0], embeddings[1])
     assert isinstance(similarity_score, float)
     assert 0.0 <= similarity_score <= 1.0
 
 
+@pytest.mark.skipif(
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_dimension_property(embedding_model):
     assert isinstance(embedding_model.dimension, int)
     assert embedding_model.dimension > 0
@@ -78,6 +106,10 @@ def test_is_available():
     assert OpenAIEmbeddings.is_available() is True
 
 
+@pytest.mark.skipif(
+    "OPENAI_API_KEY" not in os.environ,
+    reason="Skipping test because OPENAI_API_KEY is not defined",
+)
 def test_repr(embedding_model):
     repr_str = repr(embedding_model)
     assert isinstance(repr_str, str)