Test set intersect (#138)

* find the intersection between different test sets * update num_reactant from 5 to 2 and min_frequency_of_occurrence from 10 to 100
sustainable-processes · Jun 9, 2023 · 8eac53a · 8eac53a
1 parent 9f97292
commit 8eac53a
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 221 deletions.
diff --git a/Makefile b/Makefile
@@ -248,10 +248,10 @@ paper_gen_uspto_no_trust_with_map: #requires: paper_extract_uspto_no_trust
 	python -m orderly.clean --output_path="data/orderly/datasets/orderly_no_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
 
 paper_gen_uspto_with_trust_with_map: #requires: paper_extract_uspto_with_trust
-	python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=10 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=5 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
+	python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
 
 paper_gen_uspto_with_trust_no_map: #requires: paper_extract_uspto_with_trust
-	python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=10 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=5 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
+	python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
 
 paper_6: paper_gen_uspto_no_trust_no_map paper_gen_uspto_no_trust_with_map paper_gen_uspto_with_trust_with_map paper_gen_uspto_with_trust_no_map
 

diff --git a/notebooks/fast_test_train_intersection.ipynb b/notebooks/fast_test_train_intersection.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,230 +29,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "34307\n",
-      "['OCC1CCCO1' None]\n",
-      "58860\n",
-      "['OCC1CCCO1' None]\n",
-      "62541\n",
-      "['OCC1CCCO1' None]\n",
-      "68327\n",
-      "['OCC1CCCO1' None]\n",
-      "68329\n",
-      "['OCC1CCCO1' None]\n",
-      "68652\n",
-      "['OCC1CCCO1' None]\n",
-      "68654\n",
-      "['OCC1CCCO1' None]\n",
-      "74150\n",
-      "['CCN(CC)CC' 'OCC1CCCO1']\n",
-      "101631\n",
-      "['OCC1CCCO1' None]\n",
-      "110332\n",
-      "['OCC1CCCO1' None]\n",
-      "114634\n",
-      "['OCC1CCCO1' None]\n",
-      "127463\n",
-      "['OCC1CCCO1' None]\n",
-      "128001\n",
-      "['OCC1CCCO1' None]\n",
-      "128013\n",
-      "['OCC1CCCO1' None]\n",
-      "128499\n",
-      "['OCC1CCCO1' None]\n",
-      "132968\n",
-      "['OCC1CCCO1' None]\n",
-      "137430\n",
-      "['OCC1CCCO1' None]\n",
-      "141937\n",
-      "['OCC1CCCO1' None]\n",
-      "143416\n",
-      "['OCC1CCCO1' None]\n",
-      "148328\n",
-      "['OCC1CCCO1' None]\n",
-      "149017\n",
-      "['OCC1CCCO1' None]\n",
-      "149827\n",
-      "['OCC1CCCO1' None]\n",
-      "149828\n",
-      "['OCC1CCCO1' None]\n",
-      "151308\n",
-      "['OCC1CCCO1' None]\n",
-      "154726\n",
-      "['OCC1CCCO1' None]\n",
-      "161780\n",
-      "['OCC1CCCO1' None]\n",
-      "187387\n",
-      "['O' 'OCC1CCCO1']\n",
-      "194731\n",
-      "['OCC1CCCO1' None]\n",
-      "195695\n",
-      "['O' 'OCC1CCCO1']\n",
-      "203215\n",
-      "['O' 'OCC1CCCO1']\n",
-      "204637\n",
-      "['O=CO' 'OCC1CCCO1']\n",
-      "210945\n",
-      "['O' 'OCC1CCCO1']\n",
-      "210946\n",
-      "['O' 'OCC1CCCO1']\n",
-      "210948\n",
-      "['O' 'OCC1CCCO1']\n",
-      "210949\n",
-      "['O' 'OCC1CCCO1']\n",
-      "224183\n",
-      "['OCC1CCCO1' None]\n",
-      "228080\n",
-      "['OCC1CCCO1' None]\n",
-      "229343\n",
-      "['OCC1CCCO1' None]\n",
-      "233711\n",
-      "['OCC1CCCO1' None]\n",
-      "245055\n",
-      "['O' 'OCC1CCCO1']\n",
-      "245056\n",
-      "['O' 'OCC1CCCO1']\n",
-      "245058\n",
-      "['O' 'OCC1CCCO1']\n",
-      "245059\n",
-      "['O' 'OCC1CCCO1']\n",
-      "247782\n",
-      "['OCC1CCCO1' None]\n",
-      "259764\n",
-      "['OCC1CCCO1' None]\n",
-      "264401\n",
-      "['OCC1CCCO1' None]\n",
-      "267955\n",
-      "['OCC1CCCO1' None]\n",
-      "268185\n",
-      "['O' 'OCC1CCCO1']\n",
-      "268584\n",
-      "['OCC1CCCO1' None]\n",
-      "278882\n",
-      "['OCC1CCCO1' None]\n",
-      "281471\n",
-      "['OCC1CCCO1' None]\n",
-      "282835\n",
-      "['CCOC(C)=O' 'OCC1CCCO1']\n",
-      "282886\n",
-      "['OCC1CCCO1' None]\n",
-      "282894\n",
-      "['OCC1CCCO1' None]\n",
-      "282901\n",
-      "['OCC1CCCO1' None]\n",
-      "283847\n",
-      "['OCC1CCCO1' None]\n",
-      "287862\n",
-      "['OCC1CCCO1' None]\n",
-      "293441\n",
-      "['OCC1CCCO1' None]\n",
-      "300382\n",
-      "['O' 'OCC1CCCO1']\n",
-      "300397\n",
-      "['O' 'OCC1CCCO1']\n",
-      "300411\n",
-      "['O' 'OCC1CCCO1']\n",
-      "311473\n",
-      "['OCC1CCCO1' None]\n",
-      "312713\n",
-      "['O' 'OCC1CCCO1']\n",
-      "312727\n",
-      "['O' 'OCC1CCCO1']\n",
-      "338143\n",
-      "['OCC1CCCO1' None]\n",
-      "339244\n",
-      "['OCC1CCCO1' None]\n",
-      "339862\n",
-      "['O' 'OCC1CCCO1']\n",
-      "351248\n",
-      "['O' 'OCC1CCCO1']\n",
-      "351249\n",
-      "['OCC1CCCO1' None]\n",
-      "353752\n",
-      "['O' 'OCC1CCCO1']\n",
-      "354705\n",
-      "['OCC1CCCO1' None]\n",
-      "372195\n",
-      "['O' 'OCC1CCCO1']\n",
-      "377535\n",
-      "['O' 'OCC1CCCO1']\n",
-      "382436\n",
-      "['CCOC(C)=O' 'OCC1CCCO1']\n",
-      "401475\n",
-      "['O' 'OCC1CCCO1']\n",
-      "403912\n",
-      "['OCC1CCCO1' None]\n",
-      "408393\n",
-      "['O' 'OCC1CCCO1']\n",
-      "414196\n",
-      "['CCOC(C)=O' 'OCC1CCCO1']\n",
-      "415182\n",
-      "['OCC1CCCO1' None]\n",
-      "415215\n",
-      "['OCC1CCCO1' None]\n",
-      "454790\n",
-      "['O' 'OCC1CCCO1']\n",
-      "458851\n",
-      "['O' 'OCC1CCCO1']\n",
-      "466098\n",
-      "['O' 'OCC1CCCO1']\n",
-      "473298\n",
-      "['OCC1CCCO1' None]\n",
-      "473308\n",
-      "['OCC1CCCO1' None]\n",
-      "480182\n",
-      "['O' 'OCC1CCCO1']\n",
-      "501472\n",
-      "['OCC1CCCO1' None]\n",
-      "508052\n",
-      "['OCC1CCCO1' None]\n",
-      "510500\n",
-      "['OCC1CCCO1' None]\n",
-      "520416\n",
-      "['OCC1CCCO1' None]\n",
-      "539132\n",
-      "['O' 'OCC1CCCO1']\n",
-      "539217\n",
-      "['OCC1CCCO1' None]\n",
-      "542065\n",
-      "['OCC1CCCO1' None]\n",
-      "542072\n",
-      "['OCC1CCCO1' None]\n",
-      "547433\n",
-      "['OCC1CCCO1' None]\n",
-      "547854\n",
-      "['OCC1CCCO1' None]\n",
-      "561913\n",
-      "['O' 'OCC1CCCO1']\n",
-      "564862\n",
-      "['O' 'OCC1CCCO1']\n",
-      "572405\n",
-      "['O' 'OCC1CCCO1']\n",
-      "573579\n",
-      "['O' 'OCC1CCCO1']\n",
-      "577100\n",
       "['OCC1CCCO1' None]\n"
      ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m solv \u001b[39m=\u001b[39m df[[\u001b[39m'\u001b[39m\u001b[39msolvent_000\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39msolvent_001\u001b[39m\u001b[39m'\u001b[39m]]\n\u001b[1;32m      2\u001b[0m \u001b[39m#iterate over all rows\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfor\u001b[39;00m index, row \u001b[39min\u001b[39;00m solv\u001b[39m.\u001b[39miterrows():\n\u001b[1;32m      4\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39mOCC1CCCO1\u001b[39m\u001b[39m'\u001b[39m \u001b[39min\u001b[39;00m row\u001b[39m.\u001b[39mvalues:\n\u001b[1;32m      5\u001b[0m         \u001b[39mprint\u001b[39m(index)\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/frame.py:1399\u001b[0m, in \u001b[0;36mDataFrame.iterrows\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1397\u001b[0m using_cow \u001b[39m=\u001b[39m using_copy_on_write()\n\u001b[1;32m   1398\u001b[0m \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalues):\n\u001b[0;32m-> 1399\u001b[0m     s \u001b[39m=\u001b[39m klass(v, index\u001b[39m=\u001b[39;49mcolumns, name\u001b[39m=\u001b[39;49mk)\u001b[39m.\u001b[39;49m__finalize__(\u001b[39mself\u001b[39;49m)\n\u001b[1;32m   1400\u001b[0m     \u001b[39mif\u001b[39;00m using_cow \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mgr\u001b[39m.\u001b[39mis_single_block:\n\u001b[1;32m   1401\u001b[0m         s\u001b[39m.\u001b[39m_mgr\u001b[39m.\u001b[39madd_references(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mgr)  \u001b[39m# type: ignore[arg-type]\u001b[39;00m\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/generic.py:5955\u001b[0m, in \u001b[0;36mNDFrame.__finalize__\u001b[0;34m(self, other, method, **kwargs)\u001b[0m\n\u001b[1;32m   5952\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m other\u001b[39m.\u001b[39mattrs:\n\u001b[1;32m   5953\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mattrs[name] \u001b[39m=\u001b[39m other\u001b[39m.\u001b[39mattrs[name]\n\u001b[0;32m-> 5955\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mflags\u001b[39m.\u001b[39mallows_duplicate_labels \u001b[39m=\u001b[39m other\u001b[39m.\u001b[39;49mflags\u001b[39m.\u001b[39;49mallows_duplicate_labels\n\u001b[1;32m   5956\u001b[0m \u001b[39m# For subclasses using _metadata.\u001b[39;00m\n\u001b[1;32m   5957\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m \u001b[39mset\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_metadata) \u001b[39m&\u001b[39m \u001b[39mset\u001b[39m(other\u001b[39m.\u001b[39m_metadata):\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/flags.py:53\u001b[0m, in \u001b[0;36mFlags.allows_duplicate_labels\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     50\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_allows_duplicate_labels \u001b[39m=\u001b[39m allows_duplicate_labels\n\u001b[1;32m     51\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_obj \u001b[39m=\u001b[39m weakref\u001b[39m.\u001b[39mref(obj)\n\u001b[0;32m---> 53\u001b[0m \u001b[39m@property\u001b[39m\n\u001b[1;32m     54\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mallows_duplicate_labels\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mbool\u001b[39m:\n\u001b[1;32m     55\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m     56\u001b[0m \u001b[39m    Whether this object allows duplicate labels.\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     81\u001b[0m \u001b[39m    a        [0, 1]\u001b[39;00m\n\u001b[1;32m     82\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m     83\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_allows_duplicate_labels\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
     }
    ],
    "source": [
@@ -261,12 +47,13 @@
     "for index, row in solv.iterrows():\n",
     "    if 'OCC1CCCO1' in row.values:\n",
     "        print(index)\n",
-    "        print(row.values)"
+    "        print(row.values)\n",
+    "        break"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -677,6 +464,100 @@
     "def get_intersection(df1, df2):\n",
     "    return df1.index.intersection(df2.index)"
    ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Find reactions in the test set (rare -> other) that don't appear in the train set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_df_path = \"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/orderly_v5/orderly_no_trust_no_map_test.parquet\"\n",
+    "train_df_path = \"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/orderly_v5/orderly_no_trust_with_map_train.parquet\"\n",
+    "test_df = pd.read_parquet(test_df_path)\n",
+    "train_df = pd.read_parquet(train_df_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "30422\n",
+      "33930\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "# Extract the values from the \"rxn_str\" column in train_df\n",
+    "train_rxn_str_values = train_df[\"rxn_str\"].values\n",
+    "\n",
+    "# Filter test_df based on values present in train_rxn_str_values\n",
+    "intersection_indices = test_df[test_df[\"rxn_str\"].isin(train_rxn_str_values)].index\n",
+    "\n",
+    "# Print the number of test rows whose rxn_str also occurs in the train set, then the total number of test rows\n",
+    "print(len(intersection_indices))\n",
+    "print(len(test_df))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3508"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df = test_df.drop(intersection_indices)\n",
+    "len(test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the reduced test_df (rows overlapping with the train set already dropped) as parquet\n",
+    "test_df.to_parquet(\"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/intersection_test_set.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {