diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 0000000..b231667 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,25 @@ +# Codespell configuration is within pyproject.toml +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Annotate locations with typos + uses: codespell-project/codespell-problem-matcher@v1 + - name: Codespell + uses: codespell-project/actions-codespell@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0964e87..ec83271 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,3 +19,11 @@ repos: - sentence-transformers>=3.0.1 - tiktoken>=0.7.0 - tqdm>=4.66.4 + + - repo: https://github.com/codespell-project/codespell + # Configuration for codespell is in pyproject.toml + rev: v2.3.0 + hooks: + - id: codespell + additional_dependencies: + - tomli diff --git a/README.md b/README.md index f01ae33..6a12bdc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ LOTUS makes LLM-powered data processing fast and easy. -LOTUS (**L**LMs **O**ver **T**ables of **U**nstructured and **S**tructured Data) provides a declarative programming model and an optimized query engine for serving powerful reasoning-based query pipelines over structured and unstructured data! We provide a simple and intuitive Pandas-like API, that implements **semantic operators**. +LOTUS (**L**LMs **O**ver **T**ables of **U**nstructured and **S**tructured Data) provides a declarative programming model and an optimized query engine for serving powerful reasoning-based query pipelines over structured and unstructured data! We provide a simple and intuitive Pandas-like API, that implements **semantic operators**. 
For trouble-shooting or feature requests, please raise an issue and we'll get to it promptly. To share feedback and applications you're working on, you can send us a message on our [community slack](https://join.slack.com/t/lotus-fnm8919/shared_invite/zt-2tnq6948j-juGuSIR0__fsh~kUmZ6TJw), or send an email (lianapat@stanford.edu). @@ -88,7 +88,7 @@ LOTUS offers a number of semantic operators in a Pandas-like API, some of which | sem_filter | Keep records that match the natural language predicate | | sem_extract | Extract one or more attributes from each row | | sem_agg | Aggregate across all records (e.g. for summarization) | -| sem_topk | Order the records by some natural langauge sorting criteria | +| sem_topk | Order the records by some natural language sorting criteria | | sem_join | Join two datasets based on a natural language predicate | | sem_sim_join | Join two DataFrames based on semantic similarity | | sem_search | Perform semantic search the over a text column | diff --git a/docs/approximation_cascades.rst b/docs/approximation_cascades.rst index 9c840b8..43545c3 100644 --- a/docs/approximation_cascades.rst +++ b/docs/approximation_cascades.rst @@ -8,11 +8,11 @@ LOTUS serves approximations for semantic operators to let you balance speed and You can set accurayc targets according to the requirements of your application, and LOTUS will use approximations to optimize the implementation for lower computaitonal overhead, while providing probabilistic accuracy guarantees. One core technique for providing these approximations is the use of cascades. -Cascades provide a way to optimize certian semantic operators (Join Cascade and Filter Cascade) by blending +Cascades provide a way to optimize certain semantic operators (Join Cascade and Filter Cascade) by blending a less costly but potentially inaccurate proxy model with a high-quality oracle model. 
The method seeks to achieve preset precision and recall targets with a given probability while controlling computational overhead. -Cascades work by intially using a cheap approximation to score and filters/joins tuples. Using statistically +Cascades work by initially using a cheap approximation to score and filters/joins tuples. Using statistically supported thresholds found from sampling prior, it then assigns each tuple to one of three actions based on the proxy's score: accept, reject, or seek clarification from the oracle model. diff --git a/docs/configurations.rst b/docs/configurations.rst index 16cec15..8ac5c72 100644 --- a/docs/configurations.rst +++ b/docs/configurations.rst @@ -22,7 +22,7 @@ Configurable Parameters -------------------------- 1. enable_message_cache: - * Description: Enables or Disables cahcing mechanisms + * Description: Enables or Disables caching mechanisms * Default: False * Parameters: - cache_type: Type of caching (SQLITE or In_MEMORY) diff --git a/docs/core_concepts.rst b/docs/core_concepts.rst index 01a589b..334a993 100644 --- a/docs/core_concepts.rst +++ b/docs/core_concepts.rst @@ -2,10 +2,10 @@ Core Concepts ================== LOTUS' implements the semantic operator programming model. Semantic operators are declarative transformations over one or more -datasets, parameterized by a natural langauge expression (*langex*) that can be implemnted by a variety of AI-based algorithms. +datasets, parameterized by a natural language expression (*langex*) that can be implemented by a variety of AI-based algorithms. Semantic operators seamlessly extend the relational model, operating over datasets that may contain traditional structured data as well as unstructured fields, such as free-form text or images. Because semantic operators are composable, modular and declarative, they allow you to write -AI-based piplines with intuitive, high-level logic, leaving the rest of the work to the query engine! 
Each operator can be implmented and +AI-based pipelines with intuitive, high-level logic, leaving the rest of the work to the query engine! Each operator can be implemented and optimized in multiple ways, opening a rich space for execution plans, similar to relational operators. Here is a quick example of semantic operators in action: .. code-block:: python @@ -28,7 +28,7 @@ Here are some key semantic operators: +--------------+----------------------------------------------------------+ | sem_agg | Aggregate across all records (e.g. for summarization) | +--------------+----------------------------------------------------------+ -| sem_topk | Order records by the natural langauge ranking criteria | +| sem_topk | Order records by the natural language ranking criteria | +--------------+----------------------------------------------------------+ | sem_join | Join two datasets based on a natural language predicate | +--------------+----------------------------------------------------------+ diff --git a/docs/multimodal_models.rst b/docs/multimodal_models.rst index 16ef9c6..977c513 100644 --- a/docs/multimodal_models.rst +++ b/docs/multimodal_models.rst @@ -11,7 +11,7 @@ PIL images, numpy arrays, base64 strings, and image URLs Initializing ImageArray ----------------------- The ImageArray class is an extension array designed to handle images as data types in pandas. -You can initilize an ImageArray with a list of supported image formats +You can initialize an ImageArray with a list of supported image formats .. code-block:: python diff --git a/docs/sem_index.rst b/docs/sem_index.rst index 3e7bf74..c70ac62 100644 --- a/docs/sem_index.rst +++ b/docs/sem_index.rst @@ -5,7 +5,7 @@ Overview --------- The sem_index operator in LOTUS creates a semantic index over the specified column in the dataset. This index enables efficient retrieval and ranking of records based on semantic similarity. 
-The index will be generated with the configured retreival model stored locally in the specified directory. +The index will be generated with the configured retrieval model stored locally in the specified directory. Example diff --git a/docs/sem_map.rst b/docs/sem_map.rst index 589b91b..b871cf6 100644 --- a/docs/sem_map.rst +++ b/docs/sem_map.rst @@ -3,7 +3,7 @@ sem_map Overview ---------- -This operato performs a semantic projection over an input column. The langex parameter specifies this projection in natural language. +This operator performs a semantic projection over an input column. The langex parameter specifies this projection in natural language. Motivation ----------- diff --git a/docs/sem_sim_join.rst b/docs/sem_sim_join.rst index 79145a1..e6089e4 100644 --- a/docs/sem_sim_join.rst +++ b/docs/sem_sim_join.rst @@ -3,7 +3,7 @@ sem_sim_join Overview --------- -The similairty join matches tuples from the right and left table according to their semantic similarity, rather than an arbitrary +The similarity join matches tuples from the right and left table according to their semantic similarity, rather than an arbitrary natural-language predicate. Akin to an equi-join in standard relational algebra, the semantic similarity join is a specialized semantic join, can be heavily optimized using the semantic index. diff --git a/docs/sem_topk.rst b/docs/sem_topk.rst index d57f093..21a06bb 100644 --- a/docs/sem_topk.rst +++ b/docs/sem_topk.rst @@ -59,9 +59,9 @@ Required Parameters - **user_instruction** : The user instruction for sorting. - **K**: The number of rows to return. -Optional Paramaters ---------------------- +Optional Parameters +-------------------- - **method** : The method to use for sorting. Options are "quick", "heap", "naive", "quick-sem". - **group_by** : The columns to group by before sorting. Each group will be sorted separately. - **cascade_threshold**: The confidence threshold for cascading to a larger model. 
-- **return_stats** : Whether to return stats. \ No newline at end of file +- **return_stats** : Whether to return stats. diff --git a/examples/model_examples/cache.py b/examples/model_examples/cache.py index 95bc282..cc3ab95 100644 --- a/examples/model_examples/cache.py +++ b/examples/model_examples/cache.py @@ -23,7 +23,7 @@ df = pd.DataFrame(data) user_instruction = "{Course Name} requires a lot of math" df = df.sem_filter(user_instruction) -print("====== intial run ======") +print("====== initial run ======") print(df) # run a second time diff --git a/lotus/models/reranker.py b/lotus/models/reranker.py index a7fd599..c6ffda9 100644 --- a/lotus/models/reranker.py +++ b/lotus/models/reranker.py @@ -19,6 +19,6 @@ def __call__(self, query: str, docs: list[str], K: int) -> RerankerOutput: K (int): The number of documents to keep after reranking. Returns: - RerankerOutput: The indicies of the reranked documents. + RerankerOutput: The indices of the reranked documents. """ pass diff --git a/lotus/sem_ops/postprocessors.py b/lotus/sem_ops/postprocessors.py index d531099..06d875a 100644 --- a/lotus/sem_ops/postprocessors.py +++ b/lotus/sem_ops/postprocessors.py @@ -61,7 +61,7 @@ def extract_postprocess(llm_answers: list[str]) -> SemanticExtractPostprocessOut Postprocess the output of the extract operator to extract the schema. Args: - llm_answers (list[str]): The list of llm answers containging the extract. + llm_answers (list[str]): The list of llm answers containing the extract. Returns: SemanticExtractPostprocessOutput diff --git a/lotus/sem_ops/sem_join.py b/lotus/sem_ops/sem_join.py index 0050f49..04a2a5d 100644 --- a/lotus/sem_ops/sem_join.py +++ b/lotus/sem_ops/sem_join.py @@ -384,7 +384,7 @@ def join_optimizer( int: The number of LM calls from optimizing join plan. 
""" - # Helper is currently default to similiarity join + # Helper is currently default to similarity join if lotus.settings.helper_lm is not None: lotus.logger.debug("Helper model is not supported yet. Default to similarity join.") diff --git a/lotus/utils.py b/lotus/utils.py index b3e68ac..6f38ff6 100644 --- a/lotus/utils.py +++ b/lotus/utils.py @@ -56,7 +56,7 @@ def ret( rm.load_index(col_index_dir) assert rm.index_dir == col_index_dir - ids = df.index.tolist() # assumes df index hasn't been resest and corresponds to faiss index ids + ids = df.index.tolist() # assumes df index hasn't been reset and corresponds to faiss index ids vec_set = rm.get_vectors_from_index(col_index_dir, ids) d = vec_set.shape[1] kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose) diff --git a/pyproject.toml b/pyproject.toml index e0e09be..5e33f39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,4 +74,10 @@ line-ending = "auto" [tool.mypy] python_version = "3.10" strict = true -ignore_missing_imports = true \ No newline at end of file +ignore_missing_imports = true +[tool.codespell] +# Ref: https://github.com/codespell-project/codespell#using-a-config-file +skip = '.git*' +check-hidden = true +ignore-regex = '\bParth\b' +ignore-words-list = 'ans'