diff --git a/docs/approximation_cascades.rst b/docs/approximation_cascades.rst
new file mode 100644
index 00000000..9c840b83
--- /dev/null
+++ b/docs/approximation_cascades.rst
@@ -0,0 +1,89 @@
+Efficient Processing with Approximations
+=========================================
+
+Overview
+---------------
+
+LOTUS supports approximations for semantic operators to let you balance speed and accuracy.
+You can set accuracy targets according to the requirements of your application, and LOTUS
+will use approximations to optimize the implementation for lower computational overhead, while providing probabilistic accuracy guarantees.
+One core technique for providing these approximations is the use of cascades.
+Cascades provide a way to optimize certain semantic operators (Join Cascade and Filter Cascade) by blending
+a less costly but potentially inaccurate proxy model with a high-quality oracle model. The method seeks to achieve
+preset precision and recall targets with a given probability while controlling computational overhead.
+
+Cascades work by first using a cheap proxy to score the tuples being filtered or joined. Using statistically
+supported thresholds learned from an initial sampling step, the cascade then assigns each tuple to one of three actions based on the
+proxy's score: accept, reject, or seek clarification from the oracle model.
+
+When the proxy is accurate, most of the data is resolved quickly and inexpensively, and the tuples that remain unresolved are
+sent to the larger LM.
+
+Using Cascades
+----------------
+To use the approximation cascade-based operators, begin by configuring both the main and helper LMs using
+LOTUS's configuration settings:
+
+.. code-block:: python
+
+    import lotus
+    from lotus.models import LM
+    from lotus.types import CascadeArgs
+
+
+    gpt_4o_mini = LM("gpt-4o-mini")
+    gpt_4o = LM("gpt-4o")
+
+    lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
+
+
+Once the LMs are set up, specify the cascade parameters, such as recall and precision targets, sampling percentage, and
+the acceptable failure probability, using the CascadeArgs object.
+
+.. code-block:: python
+
+    cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.9, sampling_percentage=0.5, failure_probability=0.2)
+
+After preparing the arguments, call the semantic operator method on the DataFrame:
+
+.. code-block:: python
+
+    df, stats = df.sem_filter(user_instruction=user_instruction, cascade_args=cascade_args, return_stats=True)
+
+Note that these parameters guide the trade-off between speed and accuracy when applying the cascade operators.
+
+Interpreting Output Statistics
+-------------------------------
+For cascade operators, the output statistics contain key performance metrics.
+
+An example of the output statistics:
+
+.. code-block:: text
+
+    {'pos_cascade_threshold': 0.62,
+     'neg_cascade_threshold': 0.52,
+     'filters_resolved_by_helper_model': 95,
+     'filters_resolved_by_large_model': 8,
+     'num_routed_to_helper_model': 95}
+
+Here is a detailed explanation of each metric:
+
+1. **pos_cascade_threshold**
+   The minimum score above which tuples are automatically accepted by the helper model. In the above example, any tuple with a
+   score above 0.62 is accepted without the need for the oracle LM.
+
+2. **neg_cascade_threshold**
+   The maximum score below which tuples are automatically rejected by the helper model.
+   Any tuple scoring below 0.52 is rejected without involving the oracle LM.
+
+3. **filters_resolved_by_helper_model**
+   The number of tuples conclusively classified by the helper model.
+   A value of 95 indicates that the majority of items were efficiently handled at this stage.
+
+4. **filters_resolved_by_large_model**
+   The count of tuples requiring the oracle model's intervention.
+   Here, only 8 items needed escalation, suggesting that the chosen thresholds are effective.
+
+5. **num_routed_to_helper_model**
+   The total number of items initially processed by the helper model.
+   Since 95 items were routed and only 8 required the oracle, this shows a favorable balance between cost and accuracy.
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index a9bf8b8f..8ae24779 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,7 +14,7 @@
 project = "LOTUS"
 copyright = "2024, Liana Patel, Siddharth Jha, Carlos Guestrin, Matei Zaharia"
 author = "Liana Patel, Siddharth Jha, Carlos Guestrin, Matei Zaharia"
-release = "0.3.0"
+release = "1.0.1"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/configurations.rst b/docs/configurations.rst
new file mode 100644
index 00000000..fad61413
--- /dev/null
+++ b/docs/configurations.rst
@@ -0,0 +1,46 @@
+Setting Configurations
+=======================
+
+The Settings module is a central configuration system for managing application-wide settings.
+It ensures consistent and thread-safe access to configurations, allowing settings to be dynamically
+adjusted and temporarily overridden within specific contexts. In most of the examples so far, we have
+used the settings to configure our LM.
+
+Using the Settings module
+--------------------------
+.. code-block:: python
+
+    import lotus
+    from lotus.models import LM
+
+    lm = LM(model="gpt-4o-mini")
+    lotus.settings.configure(lm=lm)
+
+Configurable Parameters
+--------------------------
+
+1. enable_cache:
+    * Description: Enables or disables caching mechanisms
+    * Default: False
+.. code-block:: python
+
+    lotus.settings.configure(enable_cache=True)
+
+2. rm:
+    * Description: Configures the retrieval model
+    * Default: None
+.. code-block:: python
+
+    rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
+    lotus.settings.configure(rm=rm)
+
+3. helper_lm:
+    * Description: Configures a secondary helper LM, often set along with the primary LM
+    * Default: None
+.. code-block:: python
+
+    gpt_4o_mini = LM("gpt-4o-mini")
+    gpt_4o = LM("gpt-4o")
+
+    lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
+
diff --git a/docs/core_concepts.rst b/docs/core_concepts.rst
new file mode 100644
index 00000000..cc0fcaf4
--- /dev/null
+++ b/docs/core_concepts.rst
@@ -0,0 +1,41 @@
+Core Concepts
+==================
+
+LOTUS implements the semantic operator programming model. Semantic operators are declarative transformations over one or more
+datasets, parameterized by a natural language expression (*langex*) that can be implemented by a variety of AI-based algorithms.
+Semantic operators seamlessly extend the relational model, operating over datasets that may contain traditional structured data
+as well as unstructured fields, such as free-form text or images. Because semantic operators are composable, modular, and declarative, they allow you to write
+AI-based pipelines with intuitive, high-level logic, leaving the rest of the work to the query engine! Each operator can be implemented and
+optimized in multiple ways, opening a rich space for execution plans, similar to relational operators. Here is a quick example of semantic operators in action:
+
+.. code-block:: python
+
+    langex = "The {abstract} suggests that LLMs efficiently utilize long context"
+    filtered_df = papers_df.sem_filter(langex)
+
+
+With LOTUS, applications can be built by chaining together different operators. Much like relational operators can be used to
+transform tables in SQL, LOTUS operators can be used to semantically transform Pandas DataFrames.
+Here are some key semantic operators:
+
+
++--------------+--------------------------------------------------------------+
+| Operator     | Description                                                  |
++==============+==============================================================+
+| sem_map      | Map each record using a natural language projection          |
++--------------+--------------------------------------------------------------+
+| sem_extract  | Extract one or more attributes from each row                 |
++--------------+--------------------------------------------------------------+
+| sem_filter   | Keep records that match the natural language predicate       |
++--------------+--------------------------------------------------------------+
+| sem_agg      | Aggregate across all records (e.g. for summarization)        |
++--------------+--------------------------------------------------------------+
+| sem_topk     | Order the records by some natural language sorting criteria  |
++--------------+--------------------------------------------------------------+
+| sem_join     | Join two datasets based on a natural language predicate      |
++--------------+--------------------------------------------------------------+
+| sem_sim_join | Join two DataFrames based on semantic similarity             |
++--------------+--------------------------------------------------------------+
+| sem_search   | Perform semantic search over a text column                   |
++--------------+--------------------------------------------------------------+
+
diff --git a/docs/quickstart.rst b/docs/examples.rst
similarity index 63%
rename from docs/quickstart.rst
rename to docs/examples.rst
index b2fcd059..a957b4ec 100644
--- a/docs/quickstart.rst
+++ b/docs/examples.rst
@@ -1,44 +1,6 @@
-Quickstart
-============
-
-LOTUS can be used to easily build LLM applications in a couple steps.
-
-LOTUS Operators and Data Model
-----------------------------------
-
-With LOTUS, applications can be built by chaining together operators. Much like relational operators can be used to transform tables in SQL, LOTUS operators can be used to *semantically* transform Pandas dataframes. 
Here are some key operators: - -+--------------+-----------------------------------------------------+ -| Operator | Description | -+==============+=====================================================+ -| Sem_Map | Map each row of the dataframe | -+--------------+-----------------------------------------------------+ -| Sem_Filter | Keep rows that match a predicate | -+--------------+-----------------------------------------------------+ -| Sem_Agg | Aggregate information across all rows | -+--------------+-----------------------------------------------------+ -| Sem_TopK | Order the dataframe by some criteria | -+--------------+-----------------------------------------------------+ -| Sem_Join | Join two dataframes based on a predicate | -+--------------+-----------------------------------------------------+ -| Sem_Index | Create a semantic index over a column | -+--------------+-----------------------------------------------------+ -| Sem_Search | Search the dataframe for relevant rows | -+--------------+-----------------------------------------------------+ - - -A core principle of LOTUS is to provide users with a declarative interface that separates the user-specified, logical query plan from its underlying implementation. -As such, users program with LOTUS's semantic operators by writing parameterized language expressions (*langex*), rather than directly prompting an underlying LM. -For example, to filter a dataframe of research papers via its abstract column, a LOTUS user may write - -.. code-block:: python - - langex = "The {abstract} suggests that LLMs efficeintly utilize long context" - filtered_df = papers_df.sem_filter(langex) - - Examples -------------------------- +================== + Let's walk through some use cases of LOTUS. First let's configure LOTUS to use GPT-3.5-Turbo for the LLM and E5 as the embedding model. Then let's define a dataset of courses and their descriptions/workloads. @@ -129,4 +91,4 @@ Additionally, let's provide some examples to the model that can be used for demo Respond with just the topic name and nothing else.", examples=examples_df, suffix="Next Topics" ) -Now you've seen how to use LOTUS to build LLM applications in a couple steps! \ No newline at end of file +Now you've seen how to use LOTUS to implement LLM-powered transformations in a couple of steps using semantic operators in LOTUS! \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 8f8be96a..5b9f4866 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,6 +3,11 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. +.. image:: logo_with_text.png + :width: 300px + :height: 170px + :align: center + LOTUS Makes LLM-Powerd Data Processing Fast and Easy ================================================================================= @@ -14,12 +19,49 @@ LOTUS implements the semantic operator programming model and provides an optimiz :caption: Getting Started installation - quickstart + core_concepts + examples .. toctree:: :hidden: :maxdepth: 1 - :caption: Module Documentation + :caption: Semantic Operators + + sem_map + sem_extract + sem_filter + sem_agg + sem_topk + sem_join + sem_search + sem_sim_join + sem_cluster + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Utility Operators + + sem_partition + sem_index + sem_dedup + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Models + + llm + retriever_models + reranker_models + multimodal_models + +.. 
toctree:: + :hidden: + :maxdepth: 1 + :caption: Advanced Usage + + approximation_cascades + prompt_strategies + configurations - models_module - sem_ops_module diff --git a/docs/installation.rst b/docs/installation.rst index 017d365a..d00171c8 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,7 +1,7 @@ Installation ============ -Lotus can be installed as a Python library through pip. +LOTUS can be installed as a Python library through pip. Requirements ------------ @@ -12,10 +12,22 @@ Requirements Install with pip ---------------- -You can install Lotus using pip: +You can install LOTUS using pip: .. code-block:: console $ conda create -n lotus python=3.10 -y $ conda activate lotus - $ pip install lotus-ai \ No newline at end of file + $ pip install lotus-ai + +If you are running on mac, please install Faiss via conda: + +.. code-block:: console + + # CPU-only version + $ conda install -c pytorch faiss-cpu=1.9.0 + + # GPU(+CPU) version + $ conda install -c pytorch -c nvidia faiss-gpu=1.9.0 + +For more details, see `Installing FAISS via Conda `_. diff --git a/docs/llm.rst b/docs/llm.rst new file mode 100644 index 00000000..1b47e7e1 --- /dev/null +++ b/docs/llm.rst @@ -0,0 +1,38 @@ +LLM +======= + +The LM class is built on top of the LiteLLM library, and supports any model that is supported by LiteLLM. +Example models include but not limited to: OpenAI, Ollama, vLLM + +.. automodule:: lotus.models.lm + :members: + :show-inheritance: + +Example +--------- +To run a model, you can use the LM class. We use the liteLLMm library to interface with the model. This allows +ypu to use any model provider that is supported by liteLLM + +Creating a LM object for gpt-4o + +.. code-block:: python + + from lotus.models import LM + lm = LM(model="gpt-4o") + +Creating a LM object to use llama3.2 on Ollama + +.. code-block:: python + + from lotus.models import LM + lm = LM(model="ollama/llama3.2") + +Creating a LM object to use Meta-Llama-3-8B-Instruct on vLLM + +.. code-block:: python + + from lotus.models import LM + lm = LM(model='hosted_vllm/meta-llama/Meta-Llama-3-8B-Instruct', + api_base='http://localhost:8000/v1', + max_ctx_len=8000, + max_tokens=1000) \ No newline at end of file diff --git a/docs/logo_with_text.png b/docs/logo_with_text.png new file mode 100644 index 00000000..2f014f64 Binary files /dev/null and b/docs/logo_with_text.png differ diff --git a/docs/models_module.rst b/docs/models_module.rst deleted file mode 100644 index 39bf3cea..00000000 --- a/docs/models_module.rst +++ /dev/null @@ -1,7 +0,0 @@ -Models Module -============= - -.. automodule:: lotus.models - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/multimodal_models.rst b/docs/multimodal_models.rst new file mode 100644 index 00000000..d9334364 --- /dev/null +++ b/docs/multimodal_models.rst @@ -0,0 +1,54 @@ +Multimodal Models +=================== + +Multimodal models combine textual and visual data to perform advanced tasks such as +image captioning, visual questions, and more. The ImageArray class enables handling of +image data within a pandas DataFrame. Currently supports these image formats: +PIL images, numpy arrays, base64 strings, and image URLs + +Initializing ImageArray +----------------------- +The ImageArray class is an extension array designed to handle images as data types in pandas. +You can initilize an ImageArray with a list of supported image formats + +.. 
code-block:: python + + from PIL import Image + import numpy as np + from lotus.utils import ImageArray + + # Example image inputs + image1 = Image.open("path_to_image1.jpg") + image2 = np.random.randint(0, 255, (100, 100, 3), dtype="uint8") + + # Create an ImageArray + images = ImageArray([image1, image2, None]) + + +Loading ImageArray +------------------- + +The ImageArray supports multiple input formats for loading images. + +- **PIL Images** : Directly pass a PIL image object +- **Numpy Arrays** : Convert numpy arrays to PIL Images automatically +- **Base64 Strings** : Decode base 64 strings into images +- **URLs** : Fetch images from HTTP/HTTPS URLs +- **File Paths** : Load images from local or remote file Paths +- **S3 URLs** : Fetch images stored in S3 buckets + +Example: + +.. code-block:: python + + from lotus.utils import fetch_image + from PIL import Image + + image_path = "path_to_image.jpg" + image_url = "https://example.com/image.png" + base64_image = "data:image/png;base64,..." + + # Load images + pil_image = fetch_image(image_path) + url_image = fetch_image(image_url) + base64_image_obj = fetch_image(base64_image) diff --git a/docs/prompt_strategies.rst b/docs/prompt_strategies.rst new file mode 100644 index 00000000..e18c2799 --- /dev/null +++ b/docs/prompt_strategies.rst @@ -0,0 +1,66 @@ +Prompt Strategies +=================== + +In addition to calling the semantic operators, advanced prompt stratigies can be used to potentially +get or improve the desired output. Two Prompt Strategies that can be used are Chain of Thought (CoT) and +Demonstrations. + +Chain of Thought + Demonstrations: +---------------------------------- +Chain of Thought reasoning refers to structing prompts in a way that guides the model through a step-by-step process +to arrive at a final answer. By breaking down complex tasks into intermediate steps, CoT ensures more accurate and +logical output + +Here is a simple example of using chain of thought with the Semantic Filter operator + +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + + lotus.settings.configure(lm=lm) + data = { + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + ] + } + df = pd.DataFrame(data) + user_instruction = "{Course Name} requires a lot of math" + + example_data = { + "Course Name": ["Machine Learning", "Reaction Mechanisms", "Nordic History"], + "Answer": [True, True, False], + "Reasoning": ["Machine Learning requires a solid understanding of linear alebra and calculus", + "Reaction Engineering requires Ordinary Differential Equations to solve reactor design problems", + "Nordic History has no math involved"] + } + examples = pd.DataFrame(example_data) + + df = df.sem_filter(user_instruction, examples=examples, strategy="cot") + print(df) + +When calling the Semantic Filter operator, we pass in an example DataFrame as well as the CoT strategy, which acts as a guide +for how the model should reason and respond to the given instructions. For instance, in the examples DataFrame + +* "Machine Learning" has an answer of True, with reasoning that it requires a solid understanding of linear algebra and calculus. +* "Reaction Mechanisms" also has an answer of True, justified by its reliance on ordinary differential equations for solving reactor design problems. 
+* "Nordic History" has an answer of False, as it does not involve any mathematical concepts. + +Using the CoT strategy will provide an output below: + ++---+----------------------------------------+-------------------------------------------------------------------+ +| | Course Name | explanation_filter | ++---+----------------------------------------+-------------------------------------------------------------------+ +| 0 | Probability and Random Processes | Probability and Random Processes is heavily based on... | ++---+----------------------------------------+-------------------------------------------------------------------+ +| 1 | Optimization Methods in Engineering | Optimization Methods in Engineering typically involves... | ++---+----------------------------------------+-------------------------------------------------------------------+ +| 2 | Digital Design and Integrated Circuits | Digital Design and Integrated Circuits typically covers... | ++---+-------------------------------------+----------------------------------------------------------------------+ \ No newline at end of file diff --git a/docs/reranker_models.rst b/docs/reranker_models.rst new file mode 100644 index 00000000..52329786 --- /dev/null +++ b/docs/reranker_models.rst @@ -0,0 +1,27 @@ +ReRanker Models +================== + +Any CrossEncoder from SentenceTransfomers can be used with the CrossEncoderReranker class, by +passing the model name to the model parameter. The LM class and Retrieval model class can also be passed +to the model parameter + +.. automodule:: lotus.models.reranker + :members: + :show-inheritance: + +Example +-------- +Passing the LM, Retrieval, and ReRanker to model parameters + +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, CrossEncoderReranker, SentenceTransformersRM + + lm = LM(model="gpt-4o-mini") + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1") + + lotus.settings.configure(lm=lm, rm=rm, reranker=reranker) \ No newline at end of file diff --git a/docs/retriever_models.rst b/docs/retriever_models.rst new file mode 100644 index 00000000..7f365b3e --- /dev/null +++ b/docs/retriever_models.rst @@ -0,0 +1,40 @@ +Retrieval Models +================== + +Any model from the SentenceTransformers can be used with the SentenceTransformerssRM class, by passing +the model name to the model parameter. Additionally, LiteLLM can be used with any model supported by +LiteLLM + +.. automodule:: lotus.models.rm + :members: + :show-inheritance: + +Example +---------- +Using just the SentenceTransformersRM class + +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import SentenceTransformersRM + + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + + lotus.settings.configure(rm=rm) + + +Using SentenceTransformersRM and gpt-40-mini + +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, LiteLLMRM + + lm = LM(model="gpt-4o-mini") + rm = LiteLLMRM(model="text-embedding-3-small") + + lotus.settings.configure(lm=lm, rm=rm) \ No newline at end of file diff --git a/docs/sem_agg.rst b/docs/sem_agg.rst new file mode 100644 index 00000000..b48cb3b3 --- /dev/null +++ b/docs/sem_agg.rst @@ -0,0 +1,132 @@ +sem_agg +====================== + +.. 
automodule:: lotus.sem_ops.sem_agg + :members: + :show-inheritance: + + + +Overview +--------- +This operator performs an aggregation over the input relation, with +a langex signature that provides a commutative and associative aggregation function + +Motivation +----------- +Semantic aggregations are useful for tasks, such as summarization and reasoning across multiple rows of the dataset. + + + +Examples +--------- +.. code-block:: python + + import pandas as pd + + import lotus + + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=lm) + + data = { + "ArticleTitle": [ + "Advancements in Quantum Computing", + "Climate Change and Renewable Energy", + "The Rise of Artificial Intelligence", + "A Journey into Deep Space Exploration" + ], + "ArticleContent": [ + """Quantum computing harnesses the properties of quantum mechanics + to perform computations at speeds unimaginable with classical machines. + As research and development progress, emerging quantum algorithms show + great promise in solving previously intractable problems.""", + + """Global temperatures continue to rise, and societies worldwide + are turning to renewable resources like solar and wind power to mitigate + climate change. The shift to green technology is expected to reshape + economies and significantly reduce carbon footprints.""", + + """Artificial Intelligence (AI) has grown rapidly, integrating + into various industries. Machine learning models now enable systems to + learn from massive datasets, improving efficiency and uncovering hidden + patterns. However, ethical concerns about privacy and bias must be addressed.""", + + """Deep space exploration aims to understand the cosmos beyond + our solar system. Recent missions focus on distant exoplanets, black holes, + and interstellar objects. Advancements in propulsion and life support systems + may one day enable human travel to far-off celestial bodies.""" + ] + } + + df = pd.DataFrame(data) + + df = df.sem_agg("Provide a concise summary of all {ArticleContent} in a single paragraph, highlighting the key technological progress and its implications for the future.") + print(df._output[0]) + +Output: + +.. code-block:: text + + "Recent technological advancements are reshaping various fields and have significant implications for the future. + Quantum computing is emerging as a powerful tool capable of solving complex problems at unprecedented speeds, while the + global shift towards renewable energy sources like solar and wind power aims to combat climate change and transform economies. + In the realm of Artificial Intelligence, rapid growth and integration into industries are enhancing efficiency and revealing + hidden data patterns, though ethical concerns regarding privacy and bias persist. Additionally, deep space exploration is + advancing with missions targeting exoplanets and black holes, potentially paving the way for human travel beyond our solar + system through improved propulsion and life support technologies." + +Example with group-by +--------------------- +.. 
code-block:: python + + import pandas as pd + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=lm) + + # Example DataFrame + data = { + "Category": ["Tech", "Env", "Tech", "Env"], + "ArticleContent": [ + "Quantum computing shows promise in solving complex problems.", + "Renewable energy helps mitigate climate change.", + "AI improves efficiency but raises ethical concerns.", + "New holes in the ozone layer have been found." + ] + } + + df = pd.DataFrame(data) + + # Perform semantic aggregation with groupby + df = df.sem_agg( + "Summarize the {ArticleContent} for each {Category}.", + groupby=["Category"] + ) + + print(df._output) + +Output: + +.. code-block:: text + + 0 Context: Renewable energy plays a crucial role... + 0 Context: Deep space exploration is primarily c... + 0 Context: Quantum computing is emerging as a po... + + + + +Required Parameters +-------------------- +- **user_instructions** : Prompt to pass into LM + +Optional Parameters +-------------------- +- **all_cols** : Whether to use all columns in the dataframe. +- **suffix** : The suffix for the new column +- **group_by** : The columns to group by before aggregation. Each group will be aggregated separately. \ No newline at end of file diff --git a/docs/sem_cluster.rst b/docs/sem_cluster.rst new file mode 100644 index 00000000..f0800d1e --- /dev/null +++ b/docs/sem_cluster.rst @@ -0,0 +1,71 @@ +sem_cluster_by +===================== + +.. automodule:: lotus.sem_ops.sem_cluster_by + :members: + :show-inheritance: + +Overview +--------- +The cluster operator creates groups over the input dataframe according +to semantic similarity. + +Motivation +----------- +Clustering is useful when you would like to group togethe similar records within the dataset. + +Example +--------- +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, SentenceTransformersRM + + lm = LM(model="gpt-4o-mini") + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + + lotus.settings.configure(lm=lm, rm=rm) + data = { + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + "Cooking", + "Food Sciences", + ] + } + df = pd.DataFrame(data) + df = df.sem_index("Course Name", "course_name_index").sem_cluster_by("Course Name", 2) + print(df) + +Output: + ++---+----------------------------------------+------------+ +| | Course Name | cluster_id | ++---+----------------------------------------+------------+ +| 0 | Probability and Random Processes | 0 | ++---+----------------------------------------+------------+ +| 1 | Optimization Methods in Engineering | 0 | ++---+----------------------------------------+------------+ +| 2 | Digital Design and Integrated Circuits | 0 | ++---+----------------------------------------+------------+ +| 3 | Computer Security | 1 | ++---+----------------------------------------+------------+ +| 4 | Cooking | 1 | ++---+----------------------------------------+------------+ +| 5 | Food Sciences | 1 | ++---+----------------------------------------+------------+ + + +Required Parameters +-------------------- +- **col_name** : The column name to cluster on. +- **ncentroids** : The number of centroids. + +Optional Parameters +--------------------- +- **niter** : The number of iterations. +- **verbose** : Whether to print verbose output. 
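+
+As a quick follow-up to the example above, here is a minimal sketch of inspecting the resulting
+groups. It uses only plain pandas and assumes the ``cluster_id`` column shown in the output above.
+
+.. code-block:: python
+
+    # Group rows by the cluster_id column added by sem_cluster_by and
+    # print the members of each cluster.
+    for cluster_id, group in df.groupby("cluster_id"):
+        print(f"Cluster {cluster_id}: {group['Course Name'].tolist()}")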
\ No newline at end of file diff --git a/docs/sem_dedup.rst b/docs/sem_dedup.rst new file mode 100644 index 00000000..b6a32514 --- /dev/null +++ b/docs/sem_dedup.rst @@ -0,0 +1,64 @@ +sem_dedup +======================== + +.. automodule:: lotus.sem_ops.sem_dedup + :members: + :show-inheritance: + + +Overview +--------- +Semantic deduplication is a process designed to identify and eliminate semantically +redundant entries from datasets, focusing on meaning rather than exact textual matches. +Entity de-duplication can be implemented as a semantic self-join, but we provide an additional utility function. + +Motivation +----------- +Unlike traditional deduplication techniques, which rely on exact or near-exact string comparisons, +semantic deduplication uses language models to compare the underlying meaning of text entries. +This ensures that even paraphrased or contextually similar items can be identified as duplicates. + +Example +-------- +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import SentenceTransformersRM + + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + + lotus.settings.configure(rm=rm) + data = { + "Text": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + "I don't know what day it is", + "I don't know what time it is", + "Harry potter and the Sorcerer's Stone", + ] + } + df = pd.DataFrame(data) + df = df.sem_index("Text", "index_dir").sem_dedup("Text", threshold=0.815) + print(df) + +Output: + ++---+------------------------------------------+ +| | Text | ++---+------------------------------------------+ +| 0 | Probability and Random Processes | ++---+------------------------------------------+ +| 5 | I don't know what time it is | ++---+------------------------------------------+ +| 6 | Harry Potter and the Sorcerer's Stone | ++---+------------------------------------------+ + +Required Parameters +-------------------- +- **col_name** : The column name to deduplicate on +- **threshold** : The threshold for similarity score + diff --git a/docs/sem_extract.rst b/docs/sem_extract.rst new file mode 100644 index 00000000..4412e689 --- /dev/null +++ b/docs/sem_extract.rst @@ -0,0 +1,95 @@ +sem_extract +================== + +.. automodule:: lotus.sem_ops.sem_extract + :members: + :show-inheritance: + +Overview +--------- +The sem_extract operator generates one or more columns from the input columns. +Each output columns is specified by a natural language projection. +Optionally, you can also extract direct quotes from the source text to support each output. + + +Motivation +----------- +Semantic extractions can be useful for generating structured schemas that provide a simplified view of the data from a column of unstructured documents. +The quoting functionality can also be useful for tasks, such as entity extraction or fact-checking, where finding snippets or verified quotes +may be preferable to synthesized answers. + + +Example +-------- +.. 
code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=lm) + + df = pd.DataFrame( + { + "description": [ + "Yoshi is 25 years old", + "Bowser is 45 years old", + "Luigi is 15 years old", + ] + } + ) + input_cols = ["description"] + + # A description can be specified for each output column + output_cols = { + "masked_col_1": "The name of the person", + "masked_col_2": "The age of the person", + } + + # you can optionally set extract_quotes=True to return quotes that support each output + new_df = df.sem_extract(input_cols, output_cols, extract_quotes=True) + print(new_df) + + # A description can also be omitted for each output column + output_cols = { + "name": None, + "age": None, + } + new_df = df.sem_extract(input_cols, output_cols) + print(new_df) + +Output: + ++---+--------------------------+---------------+---------------+---------------------+---------------------+ +| | description | masked_col_1 | masked_col_2 | masked_col_1_quote | masked_col_2_quote | ++===+==========================+===============+===============+=====================+=====================+ +| 0 | Yoshi is 25 years old | Yoshi | 25 | Yoshi | 25 years old | ++---+--------------------------+---------------+---------------+---------------------+---------------------+ +| 1 | Bowser is 45 years old | Bowser | 45 | Bowser | 45 years old | ++---+--------------------------+---------------+---------------+---------------------+---------------------+ +| 2 | Luigi is 15 years old | Luigi | 15 | Luigi | 15 years old | ++---+--------------------------+---------------+---------------+---------------------+---------------------+ + ++---+--------------------------+---------------+---------------+ +| | description | masked_col_1 | masked_col_2 | ++===+==========================+===============+===============+ +| 0 | Yoshi is 25 years old | Yoshi | 25 | ++---+--------------------------+---------------+---------------+ +| 1 | Bowser is 45 years old | Bowser | 45 | ++---+--------------------------+---------------+---------------+ +| 2 | Luigi is 15 years old | Luigi | 15 | ++---+--------------------------+---------------+---------------+ + + +Required Parameters +-------------------- +- **input_cols** : The columns that a model should extract from. +- **output_cols** : A mapping from desired output column names to optional descriptions. + +Optional Parameters +-------------------- +- **extract_quotes** : Whether to extract quotes for the output columns. Defaults to False. +- **postprocessor** : The postprocessor for the model outputs. Defaults to extract_postprocess. +- **return_raw_outputs** : Whether to return raw outputs. Defaults to False. \ No newline at end of file diff --git a/docs/sem_filter.rst b/docs/sem_filter.rst new file mode 100644 index 00000000..b6c4913b --- /dev/null +++ b/docs/sem_filter.rst @@ -0,0 +1,218 @@ +sem_filter +================= + +.. automodule:: lotus.sem_ops.sem_filter + :members: + :show-inheritance: + +Overview +--------- +sem_filter, which take a langex predicate, and returns data records that pass the predicate. + +Motivation +----------- +Semantic filtering is a complex yet vital operation in modern data processing, requiring accurate and efficient +evaluation of data rows against nuanced, natural language predicates. Unlike traditional filtering techniques, +which rely on rigid and often simplistic rules, semantic filters must leverage language models to reason contextually about the data. 
+ + +Filter Example +--------------- +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + + lotus.settings.configure(lm=lm) + data = { + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + ] + } + df = pd.DataFrame(data) + user_instruction = "{Course Name} requires a lot of math" + df = df.sem_filter(user_instruction) + print(df) + +Output: + ++---+---------------------------------------------+ +| | Course Name | ++---+---------------------------------------------+ +| 0 | Probability and Random Processes | ++---+---------------------------------------------+ +| 1 | Optimization Methods in Engineering | ++---+---------------------------------------------+ +| 2 | Digital Design and Integrated Circuits | ++---+---------------------------------------------+ + + + +Example of Filter with Approximation +----------------------- +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + from lotus.types import CascadeArgs + + + gpt_4o_mini = LM("gpt-4o-mini") + gpt_4o = LM("gpt-4o") + + lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini) + data = { + "Course Name": [ + "Probability and Random Processes", "Optimization Methods in Engineering", "Digital Design and Integrated Circuits", + "Computer Security", "Data Structures and Algorithms", "Machine Learning", "Artificial Intelligence", "Natural Language Processing", + "Introduction to Robotics", "Control Systems", "Linear Algebra and Differential Equations", "Database Systems", "Cloud Computing", + "Software Engineering", "Operating Systems", "Discrete Mathematics", "Numerical Methods", "Wireless Communication Systems", + "Embedded Systems", "Advanced Computer Architecture", "Graph Theory", "Cryptography and Network Security", + "Big Data Analytics", "Deep Learning", "Organic Chemistry", "Molecular Biology", "Environmental Science", + "Genetics and Evolution", "Human Physiology", "Introduction to Anthropology", "Cultural Studies", "Political Theory", + "Macroeconomics", "Microeconomics", "Introduction to Sociology", "Developmental Psychology", "Cognitive Science", + "Introduction to Philosophy", "Ethics and Moral Philosophy", "History of Western Civilization", "Art History: Renaissance to Modern", + "World Literature", "Introduction to Journalism", "Public Speaking and Communication", "Creative Writing", "Music Theory", + "Introduction to Theater", "Film Studies", "Environmental Policy and Law", "Sustainability and Renewable Energy", + "Urban Planning and Design", "International Relations", "Marketing Principles", "Organizational Behavior", + "Financial Accounting", "Corporate Finance", "Business Law", "Supply Chain Management", "Operations Research", + "Entrepreneurship and Innovation", "Introduction to Psychology", "Health Economics", "Biostatistics", + "Social Work Practice", "Public Health Policy", "Environmental Ethics", "History of Political Thought", "Quantitative Research Methods", + "Comparative Politics", "Urban Economics", "Behavioral Economics", "Sociology of Education", "Social Psychology", + "Gender Studies", "Media and Communication Studies", "Advertising and Brand Strategy", + "Sports Management", "Introduction to Archaeology", "Ecology and Conservation Biology", "Marine Biology", + "Geology and Earth Science", "Astronomy and Astrophysics", "Introduction to Meteorology", + "Introduction to 
Oceanography", "Quantum Physics", "Thermodynamics", "Fluid Mechanics", "Solid State Physics", + "Classical Mechanics", "Introduction to Civil Engineering", "Material Science and Engineering", "Structural Engineering", + "Environmental Engineering", "Energy Systems Engineering", "Aerodynamics", "Heat Transfer", + "Renewable Energy Systems", "Transportation Engineering", "Water Resources Management", "Principles of Accounting", + "Project Management", "International Business", "Business Analytics", + ] + } + df = pd.DataFrame(data) + user_instruction = "{Course Name} requires a lot of math" + + cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.9, sampling_percentage=0.5, failure_probability=0.2) + + df, stats = df.sem_filter(user_instruction=user_instruction, cascade_args=cascade_args, return_stats=True) + print(df) + print(stats) + +Output: + ++-----+---------------------------------------------+ +| | Course Name | ++-----+---------------------------------------------+ +| 0 | Probability and Random Processes | ++-----+---------------------------------------------+ +| 1 | Optimization Methods in Engineering | ++-----+---------------------------------------------+ +| 2 | Digital Design and Integrated Circuits | ++-----+---------------------------------------------+ +| 5 | Machine Learning | ++-----+---------------------------------------------+ +| 6 | Artificial Intelligence | ++-----+---------------------------------------------+ +| 7 | Natural Language Processing | ++-----+---------------------------------------------+ +| 8 | Introduction to Robotics | ++-----+---------------------------------------------+ +| 9 | Control Systems | ++-----+---------------------------------------------+ +| 10 | Linear Algebra and Differential Equations | ++-----+---------------------------------------------+ +| 15 | Discrete Mathematics | ++-----+---------------------------------------------+ +| 16 | Numerical Methods | ++-----+---------------------------------------------+ +| 17 | Wireless Communication Systems | ++-----+---------------------------------------------+ +| 19 | Advanced Computer Architecture | ++-----+---------------------------------------------+ +| 20 | Graph Theory | ++-----+---------------------------------------------+ +| 21 | Cryptography and Network Security | ++-----+---------------------------------------------+ +| 22 | Big Data Analytics | ++-----+---------------------------------------------+ +| 23 | Deep Learning | ++-----+---------------------------------------------+ +| 33 | Microeconomics | ++-----+---------------------------------------------+ +| 55 | Corporate Finance | ++-----+---------------------------------------------+ +| 58 | Operations Research | ++-----+---------------------------------------------+ +| 61 | Health Economics | ++-----+---------------------------------------------+ +| 62 | Biostatistics | ++-----+---------------------------------------------+ +| 67 | Quantitative Research Methods | ++-----+---------------------------------------------+ +| 69 | Urban Economics | ++-----+---------------------------------------------+ +| 81 | Astronomy and Astrophysics | ++-----+---------------------------------------------+ +| 84 | Quantum Physics | ++-----+---------------------------------------------+ +| 85 | Thermodynamics | ++-----+---------------------------------------------+ +| 86 | Fluid Mechanics | ++-----+---------------------------------------------+ +| 87 | Solid State Physics | ++-----+---------------------------------------------+ +| 88 | 
Classical Mechanics | ++-----+---------------------------------------------+ +| 89 | Introduction to Civil Engineering | ++-----+---------------------------------------------+ +| 90 | Material Science and Engineering | ++-----+---------------------------------------------+ +| 91 | Structural Engineering | ++-----+---------------------------------------------+ +| 92 | Environmental Engineering | ++-----+---------------------------------------------+ +| 93 | Energy Systems Engineering | ++-----+---------------------------------------------+ +| 94 | Aerodynamics | ++-----+---------------------------------------------+ +| 95 | Heat Transfer | ++-----+---------------------------------------------+ +| 96 | Renewable Energy Systems | ++-----+---------------------------------------------+ +| 97 | Transportation Engineering | ++-----+---------------------------------------------+ +| 102 | Business Analytics | ++-----+---------------------------------------------+ + +Output Statistics: + +{'pos_cascade_threshold': 0.62, 'neg_cascade_threshold': 0.58, 'filters_resolved_by_helper_model': 101, 'filters_resolved_by_large_model': 2, 'num_routed_to_helper_model': 101} + + +Required Parameters +--------------------- +- **user_instruction** : The user instruction for filtering. + +Optional Parameters +---------------------- +- **return_raw_outputs** : Whether to return raw outputs. Defaults to False. +- **default** : The default value for filtering in case of parsing errors. Defaults to True. +- **suffix** : The suffix for the new columns. Defaults to "_filter". +- **examples** : The examples dataframe. Defaults to None. +- **helper_examples** : The helper examples dataframe. Defaults to None. +- **strategy** : The reasoning strategy. Defaults to None. +- **cascade_args** : The arguments for join cascade. Defaults to None. + recall_target : The target recall. Defaults to None. + precision_target : The target precision when cascading. Defaults to None. + sampling_percentage : The percentage of the data to sample when cascading. Defaults to 0.1. + failure_probability : The failure probability when cascading. Defaults to 0.2. +- **return_stats** : Whether to return statistics. Defaults to False. \ No newline at end of file diff --git a/docs/sem_index.rst b/docs/sem_index.rst new file mode 100644 index 00000000..ff3f2e3a --- /dev/null +++ b/docs/sem_index.rst @@ -0,0 +1,90 @@ +sem_index +================= + +.. automodule:: lotus.sem_ops.load_sem_index + :members: + :show-inheritance: + +Overview +--------- +The sem_index operator in LOTUS creates a semantic index over the specified column in the dataset. +This index enables efficient retrieval and ranking of records based on semantic similarity. +The index will be generated with the configured retreival model stored locally in the specified directory. + + +Example +---------- +.. 
code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, CrossEncoderReranker, SentenceTransformersRM + + lm = LM(model="gpt-4o-mini") + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1") + + lotus.settings.configure(lm=lm, rm=rm, reranker=reranker) + data = { + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + "Introduction to Computer Science", + "Introduction to Data Science", + "Introduction to Machine Learning", + "Introduction to Artificial Intelligence", + "Introduction to Robotics", + "Introduction to Computer Vision", + "Introduction to Natural Language Processing", + "Introduction to Reinforcement Learning", + "Introduction to Deep Learning", + "Introduction to Computer Networks", + ] + } + df = pd.DataFrame(data) + + df = df.sem_index("Course Name", "index_dir") + print(df) + +Output: + ++----+---------------------------------------------+ +| | Course Name | ++----+---------------------------------------------+ +| 0 | Probability and Random Processes | ++----+---------------------------------------------+ +| 1 | Optimization Methods in Engineering | ++----+---------------------------------------------+ +| 2 | Digital Design and Integrated Circuits | ++----+---------------------------------------------+ +| 3 | Computer Security | ++----+---------------------------------------------+ +| 4 | Introduction to Computer Science | ++----+---------------------------------------------+ +| 5 | Introduction to Data Science | ++----+---------------------------------------------+ +| 6 | Introduction to Machine Learning | ++----+---------------------------------------------+ +| 7 | Introduction to Artificial Intelligence | ++----+---------------------------------------------+ +| 8 | Introduction to Robotics | ++----+---------------------------------------------+ +| 9 | Introduction to Computer Vision | ++----+---------------------------------------------+ +| 10 | Introduction to Natural Language Processing | ++----+---------------------------------------------+ +| 11 | Introduction to Reinforcement Learning | ++----+---------------------------------------------+ +| 12 | Introduction to Deep Learning | ++----+---------------------------------------------+ +| 13 | Introduction to Computer Networks | ++----+---------------------------------------------+ + + +Required Parameters +-------------------- +- **col_name** : The column name to index. +- **index_dir** : The directory to save the index. diff --git a/docs/sem_join.rst b/docs/sem_join.rst new file mode 100644 index 00000000..33211d66 --- /dev/null +++ b/docs/sem_join.rst @@ -0,0 +1,162 @@ +sem_join +================= + +.. automodule:: lotus.sem_ops.sem_join + :members: + :show-inheritance: + + +Overview +---------- +The sem_join operator in joins to datasets according to the langex, which specifies a predicate in natural language. + +Motivation +----------- +Traditional join operations often rely on rigid equality conditions, making them unsuitable for scenarios requiring nuanced, +context-aware relationships. The sem_join operator addresses these limitations by enabling semantic matching of rows between +datasets based on natural language predicates + + +Join Example +-------------- +.. 
code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + + lotus.settings.configure(lm=lm) + data = { + "Course Name": [ + "History of the Atlantic World", + "Riemannian Geometry", + "Operating Systems", + "Food Science", + "Compilers", + "Intro to computer science", + ] + } + + data2 = {"Skill": ["Math", "Computer Science"]} + + df1 = pd.DataFrame(data) + df2 = pd.DataFrame(data2) + join_instruction = "Taking {Course Name:left} will help me learn {Skill:right}" + res = df1.sem_join(df2, join_instruction) + print(res) + +Output: + ++---+----------------------------+-------------------+ +| | Course Name | Skill | ++---+----------------------------+-------------------+ +| 1 | Riemannian Geometry | Math | ++---+----------------------------+-------------------+ +| 2 | Operating Systems | Computer Science | ++---+----------------------------+-------------------+ +| 4 | Compilers | Computer Science | ++---+----------------------------+-------------------+ +| 5 | Intro to computer science | Computer Science | ++---+----------------------------+-------------------+ + + + +Example of Join with Approximation +---------------------- +.. code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, SentenceTransformersRM + from lotus.types import CascadeArgs + + lm = LM(model="gpt-4o-mini") + rm = SentenceTransformersRM(model="intfloat/e5-base-v2") + + lotus.settings.configure(lm=lm, rm=rm) + data = { + "Course Name": [ + "Digital Design and Integrated Circuits", + "Data Structures and Algorithms", + "The History of Art", + "Natural Language Processing", + ] + } + + skills = [ + "Math", "Computer Science", "Management", "Creative Writing", "Data Analysis", "Machine Learning", "Project Management", + "Problem Solving", "Singing", "Critical Thinking", "Public Speaking", "Teamwork", "Adaptability", "Programming", + "Leadership", "Time Management", "Negotiation", "Decision Making", "Networking", "Painting", + "Customer Service", "Marketing", "Graphic Design", "Nursery", "SEO", "Content Creation", "Video Editing", "Sales", + "Financial Analysis", "Accounting", "Event Planning", "Foreign Languages", "Software Development", "Cybersecurity", + "Social Media Management", "Photography", "Writing & Editing", "Technical Support", "Database Management", "Web Development", + "Business Strategy", "Operations Management", "UI/UX Design", "Reinforcement Learning", "Data Visualization", + "Product Management", "Cloud Computing", "Agile Methodology", "Blockchain", "IT Support", "Legal Research", "Supply Chain Management", + "Copywriting", "Human Resources", "Quality Assurance", "Medical Research", "Healthcare Management", "Sports Coaching", + "Editing & Proofreading", "Legal Writing", "Human Anatomy", "Chemistry", "Physics", "Biology", + "Psychology", "Sociology", "Anthropology", "Political Science", "Public Relations", "Fashion Design", "Interior Design", + "Automotive Repair", "Plumbing", "Carpentry", "Electrical Work", "Welding", "Electronics", "Hardware Engineering", + "Circuit Design", "Robotics", "Environmental Science", "Marine Biology", "Urban Planning", "Geography", + "Agricultural Science", "Animal Care", "Veterinary Science", "Zoology", "Ecology", "Botany", "Landscape Design", + "Baking & Pastry", "Culinary Arts", "Bartending", "Nutrition", "Dietary Planning", "Physical Training", "Yoga", + ] + data2 = pd.DataFrame({"Skill": skills}) + + + df1 = pd.DataFrame(data) + df2 = pd.DataFrame(data2) + join_instruction = 
"By taking {Course Name:left} I will learn {Skill:right}" + + cascade_args = CascadeArgs(recall_target=0.7, precision_target=0.7) + res, stats = df1.sem_join(df2, join_instruction, cascade_args=cascade_args, return_stats=True) + + + print(f"Joined {df1.shape[0]} rows from df1 with {df2.shape[0]} rows from df2") + print(f" Join cascade took {stats['join_resolved_by_large_model']} LM calls") + print(f" Helper resolved {stats['join_resolved_by_helper_model']} LM calls") + print(f"Join cascade used {stats['total_LM_calls']} LM calls in total") + print(f"Naive join would require {df1.shape[0]*df2.shape[0]} LM calls") + print(res) + +Output: + ++---+----------------------------------------+----------------------+ +| | Course Name | Skill | ++---+----------------------------------------+----------------------+ +| 0 | Digital Design and Integrated Circuits | Circuit Design | ++---+----------------------------------------+----------------------+ +| 3 | Natural Language Processing | Machine Learning | ++---+----------------------------------------+----------------------+ +| 1 | Data Structures and Algorithms | Computer Science | ++---+----------------------------------------+----------------------+ +| 0 | Digital Design and Integrated Circuits | Electronics | ++---+----------------------------------------+----------------------+ +| 0 | Digital Design and Integrated Circuits | Hardware Engineering | ++---+----------------------------------------+----------------------+ + + +Required Parameters +---------------------- +- **other** : The other dataframe or series to join with. +- **join_instruction** : The user instruction for join. + +Optional Parameters +---------------------- +- **return_explanations** : Whether to return explanations. Defaults to False. +- **how** : The type of join to perform. Defaults to "inner". +- **suffix** : The suffix for the new columns. Defaults to "_join". +- **examples** : The examples dataframe. Defaults to None. +- **strategy** : The reasoning strategy. Defaults to None. +- **default** : The default value for the join in case of parsing errors. Defaults to True. +- **cascade_args**: The arguments for join cascade. Defaults to None. + recall_target : The target recall. Defaults to None. + precision_target : The target precision when cascading. Defaults to None. + sampling_percentage : The percentage of the data to sample when cascading. Defaults to 0.1. + failure_probability : The failure probability when cascading. Defaults to 0.2. + map_instruction : The map instruction when cascading. Defaults to None. + map_examples : The map examples when cascading. Defaults to None. +- **return_stats** : Whether to return stats. Defaults to False. \ No newline at end of file diff --git a/docs/sem_map.rst b/docs/sem_map.rst new file mode 100644 index 00000000..c2d384a6 --- /dev/null +++ b/docs/sem_map.rst @@ -0,0 +1,65 @@ +sem_map +================= + +.. automodule:: lotus.sem_ops.sem_map + :members: + :show-inheritance: + +Overview +---------- +This operato performs a semantic projection over an input column. The langex parameter specifies this projection in natural language. +Motivation +----------- +The sem_map operator is useful for performing a row-wise operations over the data. + +Example +---------- +.. 
code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + + lotus.settings.configure(lm=lm) + data = { + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + ] + } + df = pd.DataFrame(data) + user_instruction = "What is a similar course to {Course Name}. Be concise." + df = df.sem_map(user_instruction) + print(df) + +Output: + ++---+----------------------------------------+----------------------------------------------------------------+ +| | Course Name | _map | ++===+========================================+================================================================+ +| 0 | Probability and Random Processes | A similar course to "Probability and Random Processes"... | ++---+----------------------------------------+----------------------------------------------------------------+ +| 1 | Optimization Methods in Engineering | A similar course to "Optimization Methods in Engineering"... | ++---+----------------------------------------+----------------------------------------------------------------+ +| 2 | Digital Design and Integrated Circuits | A similar course to "Digital Design and Integrated Circuits"...| ++---+----------------------------------------+----------------------------------------------------------------+ +| 3 | Computer Security | A similar course to "Computer Security" is "Cybersecurity"... | ++---+----------------------------------------+----------------------------------------------------------------+ + +Required Parameters +--------------------- +- **user_instruction** : The user instruction for map. +- **postprocessor** : The postprocessor for the model outputs. Defaults to map_postprocess. + +Optional Parameters +--------------------- +- **return_explanations** : Whether to return explanations. Defaults to False. +- **return_raw_outputs** : Whether to return raw outputs. Defaults to False. +- **suffix** : The suffix for the new columns. Defaults to "_map". +- **examples** : The examples dataframe. Defaults to None. +- **strategy** : The reasoning strategy. Defaults to None. \ No newline at end of file diff --git a/docs/sem_ops_module.rst b/docs/sem_ops_module.rst deleted file mode 100644 index d811746c..00000000 --- a/docs/sem_ops_module.rst +++ /dev/null @@ -1,54 +0,0 @@ -Sem Ops Module -=================== - -.. automodule:: lotus.sem_ops.sem_agg - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_extract - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_filter - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_join - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_map - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_topk - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_dedup - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_index - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_search - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.load_sem_index - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_sim_join - :members: - :show-inheritance: - -.. automodule:: lotus.sem_ops.sem_cluster_by - :members: - :show-inheritance: - -.. 
automodule:: lotus.sem_ops.sem_partition_by
-    :members:
-    :show-inheritance:
diff --git a/docs/sem_partition.rst b/docs/sem_partition.rst
new file mode 100644
index 00000000..03addb67
--- /dev/null
+++ b/docs/sem_partition.rst
@@ -0,0 +1,53 @@
+sem_partition_by
+====================
+
+.. automodule:: lotus.sem_ops.sem_partition_by
+    :members:
+    :show-inheritance:
+
+Overview
+---------
+The sem_partition_by utility in LOTUS exposes a mechanism for finer-grained control over how data is processed by operators like sem_agg.
+This operator lets you assign a partition number to each row in a DataFrame. During semantic aggregation, LOTUS will aggregate over each partition separately,
+before combining intermediate aggregations across partitions. Additionally, the partition-level aggregations are combined in increasing order of partition number.
+By default, LOTUS implements a hierarchical reduce strategy, assuming that all records belong to the same partition.
+
+Motivation
+----------
+Since LLMs are sensitive to the ordering of inputs, specifying an aggregation ordering using sem_partition_by can provide fine-grained control to achieve high-quality results for tasks like summarization.
+
+
+Example
+----------
+.. code-block:: python
+
+    import pandas as pd
+
+    import lotus
+    from lotus.models import LM, SentenceTransformersRM
+
+    lm = LM(max_tokens=2048)
+    rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
+
+    lotus.settings.configure(lm=lm, rm=rm)
+    data = {
+        "Course Name": [
+            "Probability and Random Processes",
+            "Optimization Methods in Engineering",
+            "Digital Design and Integrated Circuits",
+            "Computer Security",
+            "Cooking",
+            "Food Sciences",
+        ]
+    }
+    df = pd.DataFrame(data)
+    df = df.sem_index("Course Name", "course_name_index").sem_partition_by(lotus.utils.cluster("Course Name", 2))
+    out = df.sem_agg("Summarize all {Course Name}")._output[0]
+    print(out)
+
+
+Required Parameters
+--------------------
+- **partition_fn** : The partitioning function, which returns a list[int] indicating the partition id of each row.
+
+
diff --git a/docs/sem_search.rst b/docs/sem_search.rst
new file mode 100644
index 00000000..86851b1b
--- /dev/null
+++ b/docs/sem_search.rst
@@ -0,0 +1,85 @@
+sem_search
+==================
+
+.. automodule:: lotus.sem_ops.sem_search
+    :members:
+    :show-inheritance:
+
+Overview
+----------
+Semantic search performs similarity-based search over an indexed column. LOTUS also exposes re-ranking functionality for search,
+allowing users to specify the n_rerank parameter during semantic search. In this case, the search
+first finds the top-K most relevant documents and then re-ranks them to return the top n_rerank.
+
+Motivation
+------------
+The sem_search operator is useful for fast, lightweight retrieval over your data.
+
+Example
+-----------
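+The full example below retrieves the top K=8 matches and then re-ranks them down to 4 with a
+cross-encoder. Re-ranking is optional; as a minimal sketch (assuming the column has already been
+indexed with sem_index and a retrieval model is configured), a plain top-K search could be:
+
+.. code-block:: python
+
+    # Top-3 most similar courses, with similarity scores included in the output
+    top3 = df.sem_search("Course Name", "courses about machine learning", K=3, return_scores=True)
+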
+.. code-block:: python
+
+    import pandas as pd
+
+    import lotus
+    from lotus.models import LM, CrossEncoderReranker, SentenceTransformersRM
+
+    lm = LM(model="gpt-4o-mini")
+    rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
+    reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1")
+
+    lotus.settings.configure(lm=lm, rm=rm, reranker=reranker)
+    data = {
+        "Course Name": [
+            "Probability and Random Processes",
+            "Optimization Methods in Engineering",
+            "Digital Design and Integrated Circuits",
+            "Computer Security",
+            "Introduction to Computer Science",
+            "Introduction to Data Science",
+            "Introduction to Machine Learning",
+            "Introduction to Artificial Intelligence",
+            "Introduction to Robotics",
+            "Introduction to Computer Vision",
+            "Introduction to Natural Language Processing",
+            "Introduction to Reinforcement Learning",
+            "Introduction to Deep Learning",
+            "Introduction to Computer Networks",
+        ]
+    }
+    df = pd.DataFrame(data)
+
+    df = df.sem_index("Course Name", "index_dir").sem_search(
+        "Course Name",
+        "Which course name is most related to computer security?",
+        K=8,
+        n_rerank=4,
+    )
+    print(df)
+
+Output:
+
++---+-----------------------------------------+
+|   | Course Name                             |
++---+-----------------------------------------+
+| 3 | Computer Security                       |
++---+-----------------------------------------+
+| 13| Introduction to Computer Networks       |
++---+-----------------------------------------+
+| 4 | Introduction to Computer Science        |
++---+-----------------------------------------+
+| 5 | Introduction to Data Science            |
++---+-----------------------------------------+
+
+Required Parameters
+---------------------
+- **col_name** : The column name to search on.
+- **query** : The query string.
+
+Optional Parameters
+---------------------
+- **K** : The number of documents to retrieve.
+- **n_rerank** : The number of documents to rerank.
+- **return_scores** : Whether to return the similarity scores.
+- **suffix** : The suffix to append to the new column containing the similarity scores.
diff --git a/docs/sem_sim_join.rst b/docs/sem_sim_join.rst
new file mode 100644
index 00000000..78306dc1
--- /dev/null
+++ b/docs/sem_sim_join.rst
@@ -0,0 +1,80 @@
+sem_sim_join
+=========================
+
+.. automodule:: lotus.sem_ops.sem_sim_join
+    :members:
+    :show-inheritance:
+
+Overview
+---------
+The similarity join matches tuples from the left and right tables according to their semantic similarity, rather than an arbitrary
+natural-language predicate. Akin to an equi-join in standard relational algebra, the semantic similarity
+join is a specialized semantic join that can be heavily optimized using the semantic index.
+
+Motivation
+-----------
+This operator is useful for fast and lightweight fuzzy matching of records in two tables based on their semantic similarity.
+
+Example
+---------
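+The example below indexes the join column of the right-hand DataFrame with sem_index before joining,
+and K controls how many nearest neighbors are kept for each left row. As a minimal sketch (df1 and
+df2 as defined in the example below), keeping the two closest skills per course would look like:
+
+.. code-block:: python
+
+    # Each course is matched to its 2 most similar skills
+    res = df1.sem_sim_join(df2, left_on="Course Name", right_on="Skill", K=2)
+
+.. 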
code-block:: python + + import pandas as pd + + import lotus + from lotus.models import LM, LiteLLMRM + + lm = LM(model="gpt-4o-mini") + rm = LiteLLMRM(model="text-embedding-3-small") + + lotus.settings.configure(lm=lm, rm=rm) + data = { + "Course Name": [ + "History of the Atlantic World", + "Riemannian Geometry", + "Operating Systems", + "Food Science", + "Compilers", + "Intro to computer science", + ] + } + + data2 = {"Skill": ["Math", "Computer Science"]} + + df1 = pd.DataFrame(data) + df2 = pd.DataFrame(data2).sem_index("Skill", "skill_index") + res = df1.sem_sim_join(df2, left_on="Course Name", right_on="Skill", K=1) + print(res) + +Output: + ++---+------------------------------+----------+-------------------+ +| | Course Name | _scores | Skill | ++---+------------------------------+----------+-------------------+ +| 0 | History of the Atlantic World| 0.107831 | Math | ++---+------------------------------+----------+-------------------+ +| 1 | Riemannian Geometry | 0.345694 | Math | ++---+------------------------------+----------+-------------------+ +| 2 | Operating Systems | 0.426621 | Computer Science | ++---+------------------------------+----------+-------------------+ +| 3 | Food Science | 0.431801 | Computer Science | ++---+------------------------------+----------+-------------------+ +| 4 | Compilers | 0.345494 | Computer Science | ++---+------------------------------+----------+-------------------+ +| 5 | Intro to computer science | 0.676943 | Computer Science | ++---+------------------------------+----------+-------------------+ + + +Required Parameters +-------------------- +- **other** : The other DataFrame to join with. +- **left_on** : The column name to join on in the left DataFrame. +- **right_on** : The column name to join on in the right DataFrame. +- **K** : The number of nearest neighbors to search for. + + +Optional Parameters +-------------------- +- **lsuffix** : The suffix to append to the left DataFrame. +- **rsuffix** : The suffix to append to the right DataFrame. +- **score_suffix** : The suffix to append to the similarity score column. diff --git a/docs/sem_topk.rst b/docs/sem_topk.rst new file mode 100644 index 00000000..a9c4f413 --- /dev/null +++ b/docs/sem_topk.rst @@ -0,0 +1,71 @@ +sem_topk +================ + +.. automodule:: lotus.sem_ops.sem_topk + :members: + :show-inheritance: + +Overview +--------- +LOTUS supports a semantic top-k, which takes the langex ranking criteria. Programmers can optionally +specify a group-by parameter to indicate a subset of columns to group over during ranking. +The groupings are defined using standard equality matches over the group-by columns + +Motivation +----------- +This operator is useful for re-ordering records based on complex, arbitrary natural language comparators. + +Example +-------- +.. 
code-block:: python
+
+    import pandas as pd
+
+    import lotus
+    from lotus.models import LM
+
+    lm = LM(model="gpt-4o-mini")
+
+    lotus.settings.configure(lm=lm)
+    data = {
+        "Course Name": [
+            "Probability and Random Processes",
+            "Optimization Methods in Engineering",
+            "Digital Design and Integrated Circuits",
+            "Computer Security",
+        ]
+    }
+    df = pd.DataFrame(data)
+
+    for method in ["quick", "heap", "naive"]:
+        sorted_df, stats = df.sem_topk(
+            "Which {Course Name} requires the least math?",
+            K=2,
+            method=method,
+            return_stats=True,
+        )
+        print(sorted_df)
+        print(stats)
+
+
+Output:
+
++---+----------------------------------------+
+|   | Course Name                            |
++---+----------------------------------------+
+| 0 | Computer Security                      |
++---+----------------------------------------+
+| 1 | Digital Design and Integrated Circuits |
++---+----------------------------------------+
+
+Required Parameters
+--------------------
+- **user_instruction** : The user instruction for sorting.
+- **K** : The number of rows to return.
+
+Optional Parameters
+---------------------
+- **method** : The method to use for sorting. Options are "quick", "heap", "naive", "quick-sem".
+- **group_by** : The columns to group by before sorting. Each group will be sorted separately.
+- **cascade_threshold** : The confidence threshold for cascading to a larger model.
+- **return_stats** : Whether to return stats.
\ No newline at end of file
diff --git a/lotus/models/litellm_rm.py b/lotus/models/litellm_rm.py
index 3c7a06e4..a4486dc6 100644
--- a/lotus/models/litellm_rm.py
+++ b/lotus/models/litellm_rm.py
@@ -4,6 +4,7 @@
 from litellm import embedding
 from litellm.types.utils import EmbeddingResponse
 from numpy.typing import NDArray
+from tqdm import tqdm
 
 from lotus.dtype_extensions import convert_to_base_data
 from lotus.models.faiss_rm import FaissRM
@@ -23,7 +24,7 @@ def __init__(
 
     def _embed(self, docs: pd.Series | list) -> NDArray[np.float64]:
         all_embeddings = []
-        for i in range(0, len(docs), self.max_batch_size):
+        for i in tqdm(range(0, len(docs), self.max_batch_size)):
            batch = docs[i : i + self.max_batch_size]
            _batch = convert_to_base_data(batch)
            response: EmbeddingResponse = embedding(model=self.model, input=_batch)
diff --git a/lotus/sem_ops/sem_agg.py b/lotus/sem_ops/sem_agg.py
index 43450f76..dfb934ba 100644
--- a/lotus/sem_ops/sem_agg.py
+++ b/lotus/sem_ops/sem_agg.py
@@ -181,6 +181,9 @@ def __call__(
             if column not in self._obj.columns:
                 raise ValueError(f"column {column} not found in DataFrame. Given usr instruction: {user_instruction}")
 
+
+
+
         if group_by:
             grouped = self._obj.groupby(group_by)
             new_df = pd.DataFrame()
@@ -188,7 +191,9 @@ def __call__(
                 res = group.sem_agg(user_instruction, all_cols, suffix, None, progress_bar_desc=progress_bar_desc)
                 new_df = pd.concat([new_df, res])
             return new_df
-
+
+
+
         # Sort df by partition_id if it exists
         if "_lotus_partition_id" in self._obj.columns:
             self._obj = self._obj.sort_values(by="_lotus_partition_id")
diff --git a/lotus/sem_ops/sem_cluster_by.py b/lotus/sem_ops/sem_cluster_by.py
index b9bf8234..5811101f 100644
--- a/lotus/sem_ops/sem_cluster_by.py
+++ b/lotus/sem_ops/sem_cluster_by.py
@@ -1,5 +1,6 @@
 from typing import Any
 
+import numpy as np
 import pandas as pd
 
 import lotus
@@ -22,9 +23,11 @@ def __call__(
         self,
         col_name: str,
         ncentroids: int,
+        return_scores: bool = False,
+        return_centroids: bool = False,
         niter: int = 20,
         verbose: bool = False,
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame | tuple[pd.DataFrame, np.ndarray]:
         """
         Perform semantic clustering on the DataFrame.
@@ -43,7 +46,15 @@ def __call__(
         )
 
         cluster_fn = lotus.utils.cluster(col_name, ncentroids)
+        # indices, scores, centroids = cluster_fn(self._obj, niter, verbose)
         indices = cluster_fn(self._obj, niter, verbose)
 
         self._obj["cluster_id"] = pd.Series(indices, index=self._obj.index)
+        # if return_scores:
+        #     self._obj["centroid_sim_score"] = pd.Series(scores, index=self._obj.index)
+
+        # if return_centroids:
+        #     return self._obj, centroids
+        # else:
+        #     return self._obj
 
         return self._obj
diff --git a/lotus/utils.py b/lotus/utils.py
index 6689bff3..b3e68ac9 100644
--- a/lotus/utils.py
+++ b/lotus/utils.py
@@ -27,7 +27,10 @@ def ret(
         df: pd.DataFrame,
         niter: int = 20,
         verbose: bool = False,
+        method: str = "kmeans",
     ) -> list[int]:
         """Cluster by column, and return a series in the dataframe with cluster-ids"""
+
+        import faiss
 
@@ -60,8 +63,12 @@ def ret(
         kmeans.train(vec_set)
 
         # get nearest centroid to each vector
-        _, indices = kmeans.index.search(vec_set, 1)
-        return list(map(int, indices.flatten().tolist()))
+        scores, indices = kmeans.index.search(vec_set, 1)
+
+        # get the cluster centroids
+        # centroids = kmeans.centroids
+        # return indices.flatten(), scores.flatten(), centroids
+        return list(map(int, indices.flatten().tolist()))
 
     return ret