guestrin-lab · dhruviyer · Dec 26, 2024 · Dec 26, 2024 · Dec 26, 2024 · Dec 26, 2024
diff --git a/examples/op_examples/agg_with_grouping.py b/examples/op_examples/agg_with_grouping.py
@@ -0,0 +1,244 @@
+import time
+
+import pandas as pd
+
+import lotus
+from lotus.models import LM
+
+lm = LM(model="gpt-4o-mini")
+
+lotus.settings.configure(lm=lm)
+
+data = {
+    "Course Name": [
+        "Probability and Random Processes",
+        "Optimization Methods in Engineering",
+        "Digital Design and Integrated Circuits",
+        "Computer Security",
+        "Cooking",
+        "Food Sciences",
+        "Machine Learning",
+        "Data Structures and Algorithms",
+        "Quantum Mechanics",
+        "Organic Chemistry",
+        "Artificial Intelligence",
+        "Robotics",
+        "Thermodynamics",
+        "Fluid Mechanics",
+        "Molecular Biology",
+        "Genetics",
+        "Astrophysics",
+        "Neuroscience",
+        "Microeconomics",
+        "Macroeconomics",
+        "Linear Algebra",
+        "Calculus",
+        "Statistics",
+        "Differential Equations",
+        "Discrete Mathematics",
+        "Number Theory",
+        "Graph Theory",
+        "Topology",
+        "Complex Analysis",
+        "Real Analysis",
+        "Abstract Algebra",
+        "Numerical Methods",
+        "Cryptography",
+        "Network Security",
+        "Operating Systems",
+        "Databases",
+        "Computer Networks",
+        "Software Engineering",
+        "Compilers",
+        "Computer Architecture",
+        "Parallel Computing",
+        "Distributed Systems",
+        "Cloud Computing",
+        "Big Data Analytics",
+        "Natural Language Processing",
+        "Computer Vision",
+        "Reinforcement Learning",
+        "Deep Learning",
+        "Bioinformatics",
+        "Computational Biology",
+        "Systems Biology",
+        "Biochemistry",
+        "Physical Chemistry",
+        "Inorganic Chemistry",
+        "Analytical Chemistry",
+        "Environmental Chemistry",
+        "Materials Science",
+        "Nanotechnology",
+        "Optics",
+        "Electromagnetism",
+        "Nuclear Physics",
+        "Particle Physics",
+        "Cosmology",
+        "Planetary Science",
+        "Geophysics",
+        "Atmospheric Science",
+        "Oceanography",
+        "Ecology",
+        "Evolutionary Biology",
+        "Botany",
+        "Zoology",
+        "Microbiology",
+        "Immunology",
+        "Virology",
+        "Pharmacology",
+        "Physiology",
+        "Anatomy",
+        "Neurobiology",
+        "Cognitive Science",
+        "Psychology",
+        "Sociology",
+        "Anthropology",
+        "Archaeology",
+        "Linguistics",
+        "Philosophy",
+        "Ethics",
+        "Logic",
+        "Political Science",
+        "International Relations",
+        "Public Policy",
+        "Economics",
+        "Finance",
+        "Accounting",
+        "Marketing",
+        "Management",
+        "Entrepreneurship",
+        "Law",
+        "Criminal Justice",
+        "Human Rights",
+        "Environmental Studies",
+        "Sustainability",
+        "Urban Planning",
+        "Architecture",
+        "Civil Engineering",
+        "Mechanical Engineering",
+        "Electrical Engineering",
+        "Chemical Engineering",
+        "Aerospace Engineering",
+        "Biomedical Engineering",
+        "Environmental Engineering",
+    ],
+    "Grade Level": [
+        "High School",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Undergraduate",
+        "Undergraduate",
+        "High School",
+        "Undergraduate",
+        "High School",
+        "Undergraduate",
+        "High School",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Graduate",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "High School",
+        "High School",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Undergraduate",
+        "High School",
+        "Undergraduate",
+        "Undergraduate",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Undergraduate",
+        "Graduate",
+        "Undergraduate",
+        "High School",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Graduate",
+        "High School",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "High School",
+        "High School",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Undergraduate",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "High School",
+        "High School",
+        "Graduate",
+        "Graduate",
+        "High School",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "High School",
+        "High School",
+        "Graduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Undergraduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+        "Graduate",
+    ],
+}
+
+df = pd.DataFrame(data)
+start_time = time.time()
+df = df.sem_agg("Summarize all {Course Name}", group_by=["Grade Level"])
+end_time = time.time()
+print(df._output[0])
+print(f"Total execution time: {end_time - start_time:.2f} seconds")
diff --git a/examples/op_examples/top_k_with_grouping.py b/examples/op_examples/top_k_with_grouping.py
@@ -0,0 +1,60 @@
+import time
+
+import pandas as pd
+
+import lotus
+from lotus.models import LM
+
+lm = LM(model="gpt-4o-mini")
+
+lotus.settings.configure(lm=lm)
+
+data = {
+    "Department": ["Math", "Physics", "Computer Science", "Biology"] * 7,
+    "Course Name": [
+        "Calculus",
+        "Quantum Mechanics",
+        "Data Structures",
+        "Genetics",
+        "Linear Algebra",
+        "Thermodynamics",
+        "Algorithms",
+        "Ecology",
+        "Statistics",
+        "Optics",
+        "Machine Learning",
+        "Molecular Biology",
+        "Number Theory",
+        "Relativity",
+        "Computer Networks",
+        "Evolutionary Biology",
+        "Differential Equations",
+        "Particle Physics",
+        "Operating Systems",
+        "Biochemistry",
+        "Complex Analysis",
+        "Fluid Dynamics",
+        "Artificial Intelligence",
+        "Microbiology",
+        "Topology",
+        "Astrophysics",
+        "Cybersecurity",
+        "Immunology",
+    ],
+}
+
+df = pd.DataFrame(data)
+
+for method in ["quick", "heap", "naive"]:
+    start_time = time.time()
+    sorted_df, stats = df.sem_topk(
+        "Which {Course Name} is the most challenging?",
+        K=2,
+        method=method,
+        return_stats=True,
+        group_by=["Department"],
+    )
+    end_time = time.time()
+    print(sorted_df)
+    print(stats)
+    print(f"Total execution time: {end_time - start_time:.2f} seconds")
diff --git a/lotus/sem_ops/sem_agg.py b/lotus/sem_ops/sem_agg.py
@@ -143,6 +143,11 @@ def __init__(self, pandas_obj: Any):
     def _validate(obj: Any) -> None:
         pass
 
+    @staticmethod
+    def process_group(args):
+        group, user_instruction, all_cols, suffix, progress_bar_desc = args
+        return group.sem_agg(user_instruction, all_cols, suffix, None, progress_bar_desc=progress_bar_desc)
+
     def __call__(
         self,
         user_instruction: str,
@@ -181,19 +186,14 @@ def __call__(
             if column not in self._obj.columns:
                 raise ValueError(f"column {column} not found in DataFrame. Given usr instruction: {user_instruction}")
 
-
-
-
         if group_by:
             grouped = self._obj.groupby(group_by)
-            new_df = pd.DataFrame()
-            for name, group in grouped:
-                res = group.sem_agg(user_instruction, all_cols, suffix, None, progress_bar_desc=progress_bar_desc)
-                new_df = pd.concat([new_df, res])
-            return new_df
-
-
-
+            group_args = [(group, user_instruction, all_cols, suffix, progress_bar_desc) for _, group in grouped]
+            from concurrent.futures import ThreadPoolExecutor
+
+            with ThreadPoolExecutor(max_workers=lotus.settings.parallel_groupby_max_threads) as executor:
+                return pd.concat(list(executor.map(SemAggDataframe.process_group, group_args)))
+
         # Sort df by partition_id if it exists
         if "_lotus_partition_id" in self._obj.columns:
             self._obj = self._obj.sort_values(by="_lotus_partition_id")