From 963e15ad9b9772c82c5357ae925894ce36252315 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 14:55:00 -0700
Subject: [PATCH 1/7] Add LM tests

---
 .github/tests/lm_tests.py  | 110 +++++++++++++++++++++++++++++++++++++
 .github/workflows/ruff.yml |  24 +++++++-
 2 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 .github/tests/lm_tests.py

diff --git a/.github/tests/lm_tests.py b/.github/tests/lm_tests.py
new file mode 100644
index 00000000..1c1e6ac1
--- /dev/null
+++ b/.github/tests/lm_tests.py
@@ -0,0 +1,110 @@
+import pandas as pd
+
+import lotus
+from lotus.models import OpenAIModel
+
+# Set logger level to DEBUG
+lotus.logger.setLevel("DEBUG")
+
+gpt_4o_mini = OpenAIModel(model="gpt-4o-mini")
+gpt_4o = OpenAIModel(model="gpt-4o")
+lotus.settings.configure(lm=gpt_4o_mini)
+
+# Test filter operation on an easy dataframe
+data = {
+    "Text": [
+        "I am really excited to go to class today!",
+        "I am very sad",
+    ]
+}
+df = pd.DataFrame(data)
+user_instruction = "{Text} is a positive sentiment"
+filtered_df = df.sem_filter(user_instruction)
+
+expected_df = pd.DataFrame(
+    {
+        "Text": [
+            "I am really excited to go to class today!",
+        ]
+    }
+)
+
+assert filtered_df.equals(expected_df), f"Expected {expected_df}\n, but got\n{filtered_df}"
+
+# Test cascade
+lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
+
+# All filters are resolved by the helper model
+filtered_df, stats = df.sem_filter(user_instruction, cascade_threshold=0, return_stats=True)
+assert stats["filters_resolved_by_large_model"] == 0
+assert stats["filters_resolved_by_helper_model"] == 2
+assert filtered_df.equals(expected_df), f"Expected {expected_df}\n, but got\n{filtered_df}"
+
+# All filters are resolved by the large model
+filtered_df, stats = df.sem_filter(user_instruction, cascade_threshold=1, return_stats=True)
+assert stats["filters_resolved_by_large_model"] == 2
+assert stats["filters_resolved_by_helper_model"] == 0
+assert filtered_df.equals(expected_df), f"Expected {expected_df}\n, but got\n{filtered_df}"
+
+
+# Test top-k on an easy dataframe
+lotus.settings.configure(lm=gpt_4o_mini)
+data = {
+    "Text": [
+        "Michael Jordan is a good basketball player",
+        "Steph Curry is a good basketball player",
+        "Lionel Messi is a good soccer player",
+        "Tom Brady is a good football player",
+    ]
+}
+df = pd.DataFrame(data)
+user_instruction = "Which {Text} is most related to basketball?"
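+# sem_topk ranks the rows against the instruction and keeps only the top K matches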
+sorted_df = df.sem_topk(user_instruction, K=2, method="naive")
+
+top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"])
+top_2_actual = set(sorted_df["Text"].values)
+assert top_2_expected == top_2_actual, f"Expected {top_2_expected}\n, but got\n{top_2_actual}"
+
+# Test join on an easy dataframe
+data1 = {
+    "School": [
+        "UC Berkeley",
+        "Stanford",
+    ]
+}
+
+data2 = {"School Type": ["Public School", "Private School"]}
+
+df1 = pd.DataFrame(data1)
+df2 = pd.DataFrame(data2)
+join_instruction = "{School} is a {School Type}"
+joined_df = df1.sem_join(df2, join_instruction)
+joined_pairs = set(zip(joined_df["School"], joined_df["School Type"]))
+expected_pairs = set(
+    [
+        ("UC Berkeley", "Public School"),
+        ("Stanford", "Private School"),
+    ]
+)
+assert joined_pairs == expected_pairs, f"Expected {expected_pairs}\n, but got\n{joined_pairs}"
+
+# Test map on an easy dataframe with few-shot examples
+data = {
+    "School": [
+        "UC Berkeley",
+        "Carnegie Mellon",
+    ]
+}
+df = pd.DataFrame(data)
+examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}
+examples_df = pd.DataFrame(examples)
+user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation."
+df = df.sem_map(user_instruction, examples=examples_df, suffix="State")
+pairs = set(zip(df["School"], df["State"]))
+expected_pairs = set(
+    [
+        ("UC Berkeley", "CA"),
+        ("Carnegie Mellon", "PA"),
+    ]
+)
+assert pairs == expected_pairs, f"Expected {expected_pairs}\n, but got\n{pairs}"

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index e501002f..19e99fed 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -1,4 +1,4 @@
-name: ruff
+name: CI/CD Pipeline
 
 on:
   push:
@@ -29,3 +29,25 @@ jobs:
 
       - name: Run ruff
         run: ruff check .
+
+  test:
+    name: Run LM Tests
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -e .
+
+      - name: Run Python tests
+        run: python .github/tests/lm_tests.py

From e837565e8d9ccf4891759a499b0351e60b7c0a9b Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 15:17:37 -0700
Subject: [PATCH 2/7] Add OPENAI_API_KEY and timeout

---
 .github/workflows/ruff.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 19e99fed..2cd7cfff 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -33,6 +33,7 @@ jobs:
   test:
     name: Run LM Tests
     runs-on: ubuntu-latest
+    timeout-minutes: 10
 
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.9'
 
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
           pip install -e .
 
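+      # The LM tests call the OpenAI API, so the key is exposed from repository secrets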
+      - name: Set OpenAI API Key
+        run: echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> $GITHUB_ENV
+
       - name: Run Python tests
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: python .github/tests/lm_tests.py

From 938cc1dd1cdf3c13469133b6c7600849fbd7f0c0 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 15:21:09 -0700
Subject: [PATCH 3/7] Rename tests

---
 .github/workflows/{ruff.yml => tests.yml} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename .github/workflows/{ruff.yml => tests.yml} (96%)

diff --git a/.github/workflows/ruff.yml b/.github/workflows/tests.yml
similarity index 96%
rename from .github/workflows/ruff.yml
rename to .github/workflows/tests.yml
index 2cd7cfff..6799c85e 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/tests.yml
@@ -1,4 +1,4 @@
-name: CI/CD Pipeline
+name: Tests and Linting
 
 on:
   push:
@@ -31,7 +31,7 @@ jobs:
   test:
-    name: Run LM Tests
+    name: LM Tests
     runs-on: ubuntu-latest
     timeout-minutes: 10

From 72cdb22ba08d263da5838e6cb8b616cdfc362a26 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 15:22:16 -0700
Subject: [PATCH 4/7] Decrease timeout to 5 minutes

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6799c85e..8ec51301 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -33,7 +33,7 @@ jobs:
   test:
     name: LM Tests
     runs-on: ubuntu-latest
-    timeout-minutes: 10
+    timeout-minutes: 5
 
     steps:
       - name: Checkout code

From d8b1afa1bed06caa903b6a4e625fede3f5b48e50 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 15:39:17 -0700
Subject: [PATCH 5/7] Refactor to pytest

---
 .github/tests/lm_tests.py   | 187 +++++++++++++++++-------------------
 .github/workflows/tests.yml |   3 +-
 2 files changed, 90 insertions(+), 100 deletions(-)

diff --git a/.github/tests/lm_tests.py b/.github/tests/lm_tests.py
index 1c1e6ac1..b9265d52 100644
--- a/.github/tests/lm_tests.py
+++ b/.github/tests/lm_tests.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 
 import lotus
 from lotus.models import OpenAIModel
@@ -6,105 +7,93 @@
 # Set logger level to DEBUG
 lotus.logger.setLevel("DEBUG")
 
-gpt_4o_mini = OpenAIModel(model="gpt-4o-mini")
-gpt_4o = OpenAIModel(model="gpt-4o")
-lotus.settings.configure(lm=gpt_4o_mini)
-
-# Test filter operation on an easy dataframe
-data = {
-    "Text": [
-        "I am really excited to go to class today!",
-        "I am very sad",
-    ]
-}
-df = pd.DataFrame(data)
-user_instruction = "{Text} is a positive sentiment"
-filtered_df = df.sem_filter(user_instruction)
-
-expected_df = pd.DataFrame(
-    {
+
+@pytest.fixture
+def setup_models():
+    # Setup GPT models
+    gpt_4o_mini = OpenAIModel(model="gpt-4o-mini")
+    gpt_4o = OpenAIModel(model="gpt-4o")
+    return gpt_4o_mini, gpt_4o
+
+
+def test_filter_operation(setup_models):
+    gpt_4o_mini, _ = setup_models
+    lotus.settings.configure(lm=gpt_4o_mini)
+
+    # Test filter operation on an easy dataframe
+    data = {"Text": ["I am really excited to go to class today!", "I am very sad"]}
+    df = pd.DataFrame(data)
+    user_instruction = "{Text} is a positive sentiment"
+    filtered_df = df.sem_filter(user_instruction)
+
+    expected_df = pd.DataFrame({"Text": ["I am really excited to go to class today!"]})
+    assert filtered_df.equals(expected_df)
+
+
+def test_filter_cascade(setup_models):
+    gpt_4o_mini, gpt_4o = setup_models
+
+    data = {"Text": ["I am really excited to go to class today!", "I am very sad"]}
+    df = pd.DataFrame(data)
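+    # cascade_threshold sets how confident the helper model must be before its
+    # answer is kept; the two blocks below exercise both extremes of that setting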
+    user_instruction = "{Text} is a positive sentiment"
+
+    # All filters resolved by the helper model
+    lotus.settings.configure(lm=gpt_4o_mini, helper_lm=gpt_4o)
+    filtered_df, stats = df.sem_filter(user_instruction, cascade_threshold=0, return_stats=True)
+    assert stats["filters_resolved_by_large_model"] == 0, stats
+    assert stats["filters_resolved_by_helper_model"] == 2, stats
+    expected_df = pd.DataFrame({"Text": ["I am really excited to go to class today!"]})
+    assert filtered_df.equals(expected_df)
+
+    # All filters resolved by the large model
+    filtered_df, stats = df.sem_filter(user_instruction, cascade_threshold=1.01, return_stats=True)
+    assert stats["filters_resolved_by_large_model"] == 2, stats
+    assert stats["filters_resolved_by_helper_model"] == 0, stats
+    assert filtered_df.equals(expected_df)
+
+
+def test_top_k(setup_models):
+    gpt_4o_mini, _ = setup_models
+    lotus.settings.configure(lm=gpt_4o_mini)
+
+    data = {
+        "Text": [
+            "Lionel Messi is a good soccer player",
+            "Michael Jordan is a good basketball player",
+            "Steph Curry is a good basketball player",
+            "Tom Brady is a good football player",
+        ]
+    }
+    df = pd.DataFrame(data)
+    user_instruction = "Which {Text} is most related to basketball?"
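+    # The soccer and football rows are distractors; only the two basketball rows
+    # should survive the K=2 cut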
-sorted_df = df.sem_topk(user_instruction, K=2, method="naive") - -top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"]) -top_2_actual = set(sorted_df["Text"].values) -assert top_2_expected == top_2_actual, f"Expected {top_2_expected}\n, but got\n{top_2_actual}" - -# Test join on an easy dataframe -data1 = { - "School": [ - "UC Berkeley", - "Stanford", - ] -} - -data2 = {"School Type": ["Public School", "Private School"]} - -df1 = pd.DataFrame(data1) -df2 = pd.DataFrame(data2) -join_instruction = "{School} is a {School Type}" -joined_df = df1.sem_join(df2, join_instruction) -joined_pairs = set(zip(joined_df["School"], joined_df["School Type"])) -expected_pairs = set( - [ - ("UC Berkeley", "Public School"), - ("Stanford", "Private School"), - ] -) -assert joined_pairs == expected_pairs, f"Expected {expected_pairs}\n, but got\n{joined_pairs}" - -# Test map on an easy dataframe with few-shot examples -data = { - "School": [ - "UC Berkeley", - "Carnegie Mellon", - ] -} -df = pd.DataFrame(data) -examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]} -examples_df = pd.DataFrame(examples) -user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation." -df = df.sem_map(user_instruction, examples=examples_df, suffix="State") -pairs = set(zip(df["School"], df["State"])) -expected_pairs = set( - [ - ("UC Berkeley", "CA"), - ("Carnegie Mellon", "PA"), - ] -) -assert pairs == expected_pairs, f"Expected {expected_pairs}\n, but got\n{pairs}" + df = pd.DataFrame(data) + user_instruction = "Which {Text} is most related to basketball?" + sorted_df = df.sem_topk(user_instruction, K=2) + + top_2_expected = set(["Michael Jordan is a good basketball player", "Steph Curry is a good basketball player"]) + top_2_actual = set(sorted_df["Text"].values) + assert top_2_expected == top_2_actual + + +def test_join(): + data1 = {"School": ["UC Berkeley", "Stanford"]} + data2 = {"School Type": ["Public School", "Private School"]} + + df1 = pd.DataFrame(data1) + df2 = pd.DataFrame(data2) + join_instruction = "{School} is a {School Type}" + joined_df = df1.sem_join(df2, join_instruction) + joined_pairs = set(zip(joined_df["School"], joined_df["School Type"])) + expected_pairs = set([("UC Berkeley", "Public School"), ("Stanford", "Private School")]) + assert joined_pairs == expected_pairs + + +def test_map_fewshot(): + data = {"School": ["UC Berkeley", "Carnegie Mellon"]} + df = pd.DataFrame(data) + examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]} + examples_df = pd.DataFrame(examples) + user_instruction = "What state is {School} in? Respond only with the two-letter abbreviation." + df = df.sem_map(user_instruction, examples=examples_df, suffix="State") + + pairs = set(zip(df["School"], df["State"])) + expected_pairs = set([("UC Berkeley", "CA"), ("Carnegie Mellon", "PA")]) + assert pairs == expected_pairs diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8ec51301..5d7d8d71 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,6 +49,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -e . 
+          pip install pytest
 
       - name: Set OpenAI API Key
         run: echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> $GITHUB_ENV
 
       - name: Run Python tests
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        run: python .github/tests/lm_tests.py
+        run: pytest .github/tests/lm_tests.py

From f3ddb982a729580f2dcde81dc19d19607e883013 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 15:44:37 -0700
Subject: [PATCH 6/7] Fix prompt spelling

---
 lotus/templates/task_instructions.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/lotus/templates/task_instructions.py b/lotus/templates/task_instructions.py
index d93d7d24..2a9f3294 100644
--- a/lotus/templates/task_instructions.py
+++ b/lotus/templates/task_instructions.py
@@ -11,7 +11,7 @@ def filter_formatter_cot(
     cot_reasoning: List[str],
 ) -> List[str]:
     sys_instruction = (
-        "The user will povide a claim and some relevant context.\n"
+        "The user will provide a claim and some relevant context.\n"
         "Your job is to determine whether the claim is true for the given context.\n"
         'First give your reasoning. Then you MUST end your output with "Answer: True or False"'
     )
@@ -45,7 +45,7 @@ def filter_formatter_zs_cot(
     user_instruction: str,
 ) -> List[str]:
     sys_instruction = (
-        "The user will povide a claim and some relevant context.\n"
+        "The user will provide a claim and some relevant context.\n"
         "Your job is to determine whether the claim is true for the given context.\n"
         'First give your reasoning. Then you MUST end your output with "Answer: True or False"'
     )
@@ -71,7 +71,7 @@ def filter_formatter(
         return filter_formatter_zs_cot(df_text, user_instruction)
 
     sys_instruction = (
-        "The user will povide a claim and some relevant context.\n"
+        "The user will provide a claim and some relevant context.\n"
         "Your job is to determine whether the claim is true for the given context.\n"
         'You must answer with a single word, "True" or "False".'
     )
@@ -103,7 +103,7 @@ def map_formatter_cot(
     cot_reasoning: List[str],
 ) -> List[str]:
     sys_instruction = (
-        "The user will povide an instruction and some relevant context.\n"
+        "The user will provide an instruction and some relevant context.\n"
         "Your job is to answer the user's instruction given the context."
         "You must give your reasoning and then your final answer"
     )
@@ -119,7 +119,7 @@ def map_formatter_cot(
             [
                 {
                     "role": "user",
-                    "content": f"Context:\n{ex_df_txt}\n\Instruction: {user_instruction}",
+                    "content": f"Context:\n{ex_df_txt}\nInstruction: {user_instruction}",
                 },
                 {
                     "role": "assistant",
@@ -142,7 +142,7 @@ def map_formatter_zs_cot(
     user_instruction: str,
 ) -> List[str]:
     sys_instruction = (
-        "The user will povide an instruction and some relevant context.\n"
+        "The user will provide an instruction and some relevant context.\n"
        "Your job is to answer the user's instruction given the context."
         'First give your reasoning. Then you MUST end your output with "Answer: your answer"'
     )
@@ -153,7 +153,7 @@ def map_formatter_zs_cot(
     messages.append(
         {
             "role": "user",
-            "content": f"Context:\n{df_text}\n\Instruction: {user_instruction}",
+            "content": f"Context:\n{df_text}\nInstruction: {user_instruction}",
         }
     )
     return messages
@@ -173,7 +173,7 @@ def map_formatter(
         return map_formatter_zs_cot(df_text, user_instruction)
 
     sys_instruction = (
-        "The user will povide an instruction and some relevant context.\n"
+        "The user will provide an instruction and some relevant context.\n"
         "Your job is to answer the user's instruction given the context."
     )
     messages = [
@@ -203,7 +203,7 @@ def map_formatter(
 
 def extract_formatter(df_text: str, user_instruction: str) -> List[str]:
     sys_instruction = (
-        "The user will povide an instruction and some relevant context.\n"
+        "The user will provide an instruction and some relevant context.\n"
         "Your job is to extract the information requested in the instruction.\n"
         "Write the response in JSONL format in a single line with the following fields:\n"
         """{"answer": "your answer", "quotes": "quote from context supporting your answer"}"""

From ff09fa3bd8cde8f27a8600819ef73b2a8e85bcc8 Mon Sep 17 00:00:00 2001
From: Sid Jha
Date: Mon, 30 Sep 2024 17:03:53 -0700
Subject: [PATCH 7/7] Minor fix

---
 .github/tests/lm_tests.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/tests/lm_tests.py b/.github/tests/lm_tests.py
index b9265d52..d7ddacd1 100644
--- a/.github/tests/lm_tests.py
+++ b/.github/tests/lm_tests.py
@@ -32,13 +32,13 @@ def test_filter_cascade(setup_models):
     gpt_4o_mini, gpt_4o = setup_models
+    lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
 
     data = {"Text": ["I am really excited to go to class today!", "I am very sad"]}
     df = pd.DataFrame(data)
     user_instruction = "{Text} is a positive sentiment"
 
     # All filters resolved by the helper model
-    lotus.settings.configure(lm=gpt_4o_mini, helper_lm=gpt_4o)
     filtered_df, stats = df.sem_filter(user_instruction, cascade_threshold=0, return_stats=True)

-def test_join():
+def test_join(setup_models):
+    gpt_4o_mini, _ = setup_models
+    lotus.settings.configure(lm=gpt_4o_mini)
+
     data1 = {"School": ["UC Berkeley", "Stanford"]}
     data2 = {"School Type": ["Public School", "Private School"]}

-def test_map_fewshot():
+def test_map_fewshot(setup_models):
+    gpt_4o_mini, _ = setup_models
+    lotus.settings.configure(lm=gpt_4o_mini)
+
     data = {"School": ["UC Berkeley", "Carnegie Mellon"]}
     df = pd.DataFrame(data)
     examples = {"School": ["Stanford", "MIT"], "Answer": ["CA", "MA"]}