diff --git a/examples/op_examples/join_cascade.py b/examples/op_examples/join_cascade.py index 06535c0e..1e7ed8a1 100644 --- a/examples/op_examples/join_cascade.py +++ b/examples/op_examples/join_cascade.py @@ -15,81 +15,6 @@ "Data Structures and Algorithms", "Artificial Intelligence", "Natural Language Processing", - "Introduction to Robotics", - "Linear Algebra and Differential Equations", - "Database Systems", - "Cloud Computing", - "Software Engineering", - "Operating Systems", - "Discrete Mathematics", - "Wireless Communication Systems", - "Embedded Systems", - "Advanced Computer Architecture", - "Cryptography and Network Security", - "Big Data Analytics", - "Environmental Studies", - "Genetics and Evolution", - "Human Physiology", - "Introduction to Anthropology", - "Macroeconomics", - "Microeconomics", - "Introduction to Sociology", - "Developmental Psychology", - "Cognitive Science", - "Introduction to Philosophy", - "Ethics and Moral Philosophy", - "History of Western Civilization", - "Art History: Renaissance to Modern", - "World Literature", - "Introduction to Journalism", - "Public Speaking and Communication", - "Creative Writing", - "Introduction to Theater", - "Film Studies", - "Environmental Policy and Law", - "Sustainability and Renewable Energy", - "Urban Planning and Design", - "International Relations", - "Marketing Principles", - "Organizational Behavior", - "Financial Accounting", - "Corporate Finance", - "Operations Research", - "Entrepreneurship and Innovation", - "Introduction to Psychology", - "Biostatistics", - "Social Work Practice", - "Public Health Policy", - "Environmental Ethics", - "History of Political Thought", - "Quantitative Research Methods", - "Comparative Politics", - "Behavioral Economics", - "Sociology of Education", - "Social Psychology", - "Gender Studies", - "Media and Communication Studies", - "Advertising and Brand Strategy", - "Sports Management", - "Introduction to Archaeology", - "Ecology and Conservation Biology", - "Geology and Earth Science", - "Astronomy and Astrophysics", - "Introduction to Meteorology", - "Introduction to Oceanography", - "Introduction to Civil Engineering", - "Material Science and Engineering", - "Structural Engineering", - "Environmental Engineering", - "Energy Systems Engineering", - "Aerodynamics", - "Renewable Energy Systems", - "Transportation Engineering", - "Water Resources Management", - "Principles of Accounting", - "Project Management", - "International Business", - "Business Analytics", ] } @@ -110,23 +35,6 @@ "Circuit Design", "Robotics", "Environmental Science", "Marine Biology", "Urban Planning", "Geography", "Agricultural Science", "Animal Care", "Veterinary Science", "Zoology", "Ecology", "Botany", "Landscape Design", "Baking & Pastry", "Culinary Arts", "Bartending", "Nutrition", "Dietary Planning", "Physical Training", "Yoga", - "Meditation", "Dance", "Music Production", "Audio Engineering", "Voice Acting", "Acting", "Film Production", - "Directing", "Screenwriting", "Set Design", "Costume Design", "Stage Management", "Sound Design", "Lighting Design", - "History", "Art History", "Philosophy", "Religious Studies", "Economics", "Statistics", "Calculus", "Algebra", - "Geometry", "Trigonometry", "Early Childhood Education", "Special Education", "Teaching", "Curriculum Development", - "Educational Psychology", "Instructional Design", "Library Science", "Museum Studies", "Archaeology", - "Content Strategy", "Branding", "Product Design", "Industrial Design", "Supply Chain Analysis", "Manufacturing", - "Logistics", "Warehouse Management", "Inventory Management", "Risk Management", "Compliance", "Auditing", - "Portfolio Management", "Investment Analysis", "Real Estate", "Insurance", "Claims Processing", "Underwriting", - "Tax Preparation", "Financial Planning", "Estate Planning", "Business Analysis", "Data Mining", "Big Data", - "Natural Language Processing", "Speech Recognition", "Machine Vision", "Bioinformatics", "Chemoinformatics", - "Geoinformatics", "Energy Management", "Construction Management", "Property Management", "Facility Management", - "Hotel Management", "Travel Planning", "Event Coordination", "Salesforce", "Customer Relationship Management (CRM)", - "SAP", "Oracle", "Microsoft Excel", "Microsoft Word", "Microsoft PowerPoint", "Microsoft Outlook", "Google Sheets", - "Google Docs", "Slack", "Trello", "JIRA", "Confluence", "Asana", "Adobe Photoshop", "Adobe Illustrator", - "Adobe Premiere Pro", "Adobe After Effects", "CorelDRAW", "Figma", "Sketch", "Canva", "Final Cut Pro", "Unity", - "Unreal Engine", "Game Design", "Game Development", "Simulation Modeling", "Virtual Reality", "Augmented Reality", - "Quantum Computing", "3D Printing", "Nanotechnology" ] data2 = pd.DataFrame({"Skill": skills}) @@ -134,5 +42,10 @@ df1 = pd.DataFrame(data) df2 = pd.DataFrame(data2) join_instruction = "Taking {Course Name:left} will help me learn {Skill:right}" -res = df1.sem_join(df2, join_instruction, recall_target = 0.9, precision_target = 0.9, sampling_percentage = 0.8) +print(f"Joining {df1.shape[0]} rows from df1 with {df2.shape[0]} rows from df2") +print(f"Naive join would require {df1.shape[0]*df2.shape[0]} LM calls") + +res = df1.sem_join(df2, join_instruction, recall_target = 0.7, precision_target = 0.7) +print(f"Joined {df1.shape[0]} rows from df1 with {df2.shape[0]} rows from df2") +print(f"Naive join would require {df1.shape[0]*df2.shape[0]} LM calls") print(res)