Nle lc (#133)

* Added langchain and improved NLE * Tweaked prompt * Updated notebook * Added correlation direction * More work on prompt * Updated prompt for single vs multi molecules * More prompt refinement * Model changes * Removed oai key dependency * Updated notebook experiments * Fixed old text generate code * Added OAI key * Added openai to dev requirements * Added OAI Key and uncommented explains
ur-whitelab · Mar 10, 2023 · 1aa7e14 · 1aa7e14
1 parent e0d0bea
commit 1aa7e14
Show file tree

Hide file tree

Showing 11 changed files with 144 additions and 133 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -29,6 +29,8 @@ jobs:
       run: |
         pip install .
     - name: Run Test
+      env:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       run: |
          pytest tests --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
          # mypy -p exmol --ignore-missing-imports

diff --git a/.github/workflows/paper.yml b/.github/workflows/paper.yml
@@ -28,14 +28,20 @@ jobs:
       run: |
         pip install -r paper1_CFs/requirements.txt
     - name: Run paper1 experiments
+      env:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       run: jupyter nbconvert --ExecutePreprocessor.timeout=-1 --execute "paper1_CFs/*.ipynb" --to notebook --output-dir='temp' --clear-output
     - name: Install paper2 depends
       run: |
         pip install -r paper2_LIME/requirements.txt
     - name: Run paper2 experiments
+      env:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       run: jupyter nbconvert --ExecutePreprocessor.timeout=-1 --execute "paper2_LIME/*.ipynb" --to notebook --output-dir='temp' --clear-output
     - name: Install paper3 depends
       run: |
         pip install -r paper3_Scents/requirements.txt
     - name: Run paper3 experiments
+      env:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       run: jupyter nbconvert --ExecutePreprocessor.timeout=-1 --execute "paper3_Scents/*.ipynb" --to notebook --output-dir='temp' --clear-output
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -32,6 +32,8 @@ jobs:
       run: |
         pip install .
     - name: Run Test
+      env:
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       run: |
          pytest tests
          # mypy -p exmol --ignore-missing-imports
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,10 +1,10 @@
 Change Log
 ==========
 
-v3.0.2
+v3.0.2 (2023-02-23)
 -------------------
 * Now try to find largest component with multiple compounds separated via `.` in SMILES
-
+* Added langchain for text explanations
 
 
 v3.0.1 (2023-02-02)

diff --git a/exmol/exmol.py b/exmol/exmol.py
@@ -26,7 +26,8 @@
 from rdkit.Chem.Draw import MolToImage as mol2img, DrawMorganBit  # type: ignore
 from rdkit.Chem import rdchem  # type: ignore
 from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity, TanimotoSimilarity  # type: ignore
-
+import langchain.llms as llms
+import langchain.prompts as prompts
 
 from . import stoned
 from .plot_utils import _mol_images, _image_scatter, _bit2atoms
@@ -1331,45 +1332,66 @@ def merge_text_explains(
     return pos + joint
 
 
-_text_prompt = """
-The following are a series of questions about molecules that connect their structure to a property, along with how important each question is for the molecular property. An answer of "Yes" means that the question was true and that attribute of structure contributed to the molecular property. An answer of "Counterfactual" means the lack of that attribute contributed to the molecular property. A summary paragraph is given below, which only summarizes on the most important structure-property relationships.
-
-Property: [PROPERTY]
-[TEXT]
-Summary: The molecular property "[PROPERTY]" can be explained"""
-
-
-def text_prompt(
+_multi_prompt = (
+    "The following is information about molecules that connect their structures "
+    'to the property called "{property}." '
+    "The information is attributes of molecules expressed as questions with answers and "
+    "relative importance. "
+    "Using all aspects of this information, propose an explanation (50-150 words) "
+    'for the molecular property "{property}." '
+    "Only use the information below. Answer in a scientific "
+    'tone and make use of counterfactuals (e.g., "If X were present, {property} would be negatively...").'
+    "\n\n"
+    "{text}\n\n"
+    "Explanation:"
+)
+
+_single_prompt = (
+    "The following is information about a specific molecule that connects its structure "
+    'to the property "{property}." '
+    "The information is structural attributes expressed as questions with answers and "
+    "relative importance. "
+    "Using all aspects of this information, propose an explanation (50-150 words) "
+    'for this molecule\'s property "{property}." '
+    "Only use the information below. Answer in a scientific "
+    'tone and make use of counterfactuals (e.g., "If X were present, its {property} would be negatively...").'
+    "\n\n"
+    "{text}\n\n"
+    "Explanation:"
+)
+
+
+def text_explain_generate(
     text_explanations: List[Tuple[str, float]],
     property_name: str,
-    open_ai_key: Optional[str] = None,
+    llm: Optional[llms.BaseLLM] = None,
+    single: bool = True,
 ) -> str:
-    """Insert text explanations into template, and optionally send to OpenAI."""
-    result = _text_prompt.replace("[PROPERTY]", property_name)
+    """Insert text explanations into template, and generate explanation.
+
+    Args:
+        text_explanations: List of text explanations.
+        property_name: Name of property.
+        llm: Language model to use.
+        single: Whether to use a prompt about a single molecule or multiple molecules.
+    """
     # want to have negative examples at the end
     text_explanations.sort(key=lambda x: x[1], reverse=True)
-    result = result.replace("[TEXT]", "".join([f"{t[0]}" for t in text_explanations]))
-    if open_ai_key is not None:
-        import openai
-
-        openai.api_key = open_ai_key
-        response = openai.Completion.create(
-            model="text-davinci-003",
-            prompt=result,
-            temperature=0.7,
-            max_tokens=256,
-            top_p=1,
-            frequency_penalty=0,
-            presence_penalty=0,
-        )
-        completion = response["choices"][0]["text"]
-        return (
-            'The molecular property "'
-            + property_name
-            + '" can be explained'
-            + completion
-        )
-    return result
+    text = "\n".join(
+        [
+            # f"{x[0][:-1]} {'Positive' if x[1] > 0 else 'Negative'} correlation."
+            f"{x[0][:-1]}."
+            for x in text_explanations
+        ]
+    )
+    prompt_template = prompts.PromptTemplate(
+        input_variables=["property", "text"],
+        template=_single_prompt if single else _multi_prompt,
+    )
+    prompt = prompt_template.format(property=property_name, text=text)
+    if llm is None:
+        llm = llms.OpenAI(temperature=0.05)
+    return llm(prompt)
 
 
 def text_explain(
@@ -1444,10 +1466,10 @@ def text_explain(
             if neg_count == count - 2:
                 # don't want to have only negative examples
                 continue
-            kind = "No (Counterfactual)."
+            kind = "No and it would be negatively correlated with property (counterfactual)."
             neg_count += 1
         elif present / nbases > presence_thresh and v > 0:
-            kind = "Yes."
+            kind = "Yes and this is positively correlated with property."
             pos_count += 1
         else:
             continue