Skip to content

Commit

Permalink
Test set intersect (#138)
Browse files Browse the repository at this point in the history
* finding intersection between different test sets

* update num reactants=5 and min_freq=100
  • Loading branch information
dswigh authored Jun 9, 2023
1 parent 9f97292 commit 8eac53a
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 221 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,10 @@ paper_gen_uspto_no_trust_with_map: #requires: paper_extract_uspto_no_trust
python -m orderly.clean --output_path="data/orderly/datasets/orderly_no_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9

paper_gen_uspto_with_trust_with_map: #requires: paper_extract_uspto_with_trust
python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=10 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=5 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_with_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9

paper_gen_uspto_with_trust_no_map: #requires: paper_extract_uspto_with_trust
python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=10 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=5 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9
python -m orderly.clean --output_path="data/orderly/datasets/orderly_with_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9

paper_6: paper_gen_uspto_no_trust_no_map paper_gen_uspto_no_trust_with_map paper_gen_uspto_with_trust_with_map paper_gen_uspto_with_trust_no_map

Expand Down
319 changes: 100 additions & 219 deletions notebooks/fast_test_train_intersection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -29,230 +29,16 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"34307\n",
"['OCC1CCCO1' None]\n",
"58860\n",
"['OCC1CCCO1' None]\n",
"62541\n",
"['OCC1CCCO1' None]\n",
"68327\n",
"['OCC1CCCO1' None]\n",
"68329\n",
"['OCC1CCCO1' None]\n",
"68652\n",
"['OCC1CCCO1' None]\n",
"68654\n",
"['OCC1CCCO1' None]\n",
"74150\n",
"['CCN(CC)CC' 'OCC1CCCO1']\n",
"101631\n",
"['OCC1CCCO1' None]\n",
"110332\n",
"['OCC1CCCO1' None]\n",
"114634\n",
"['OCC1CCCO1' None]\n",
"127463\n",
"['OCC1CCCO1' None]\n",
"128001\n",
"['OCC1CCCO1' None]\n",
"128013\n",
"['OCC1CCCO1' None]\n",
"128499\n",
"['OCC1CCCO1' None]\n",
"132968\n",
"['OCC1CCCO1' None]\n",
"137430\n",
"['OCC1CCCO1' None]\n",
"141937\n",
"['OCC1CCCO1' None]\n",
"143416\n",
"['OCC1CCCO1' None]\n",
"148328\n",
"['OCC1CCCO1' None]\n",
"149017\n",
"['OCC1CCCO1' None]\n",
"149827\n",
"['OCC1CCCO1' None]\n",
"149828\n",
"['OCC1CCCO1' None]\n",
"151308\n",
"['OCC1CCCO1' None]\n",
"154726\n",
"['OCC1CCCO1' None]\n",
"161780\n",
"['OCC1CCCO1' None]\n",
"187387\n",
"['O' 'OCC1CCCO1']\n",
"194731\n",
"['OCC1CCCO1' None]\n",
"195695\n",
"['O' 'OCC1CCCO1']\n",
"203215\n",
"['O' 'OCC1CCCO1']\n",
"204637\n",
"['O=CO' 'OCC1CCCO1']\n",
"210945\n",
"['O' 'OCC1CCCO1']\n",
"210946\n",
"['O' 'OCC1CCCO1']\n",
"210948\n",
"['O' 'OCC1CCCO1']\n",
"210949\n",
"['O' 'OCC1CCCO1']\n",
"224183\n",
"['OCC1CCCO1' None]\n",
"228080\n",
"['OCC1CCCO1' None]\n",
"229343\n",
"['OCC1CCCO1' None]\n",
"233711\n",
"['OCC1CCCO1' None]\n",
"245055\n",
"['O' 'OCC1CCCO1']\n",
"245056\n",
"['O' 'OCC1CCCO1']\n",
"245058\n",
"['O' 'OCC1CCCO1']\n",
"245059\n",
"['O' 'OCC1CCCO1']\n",
"247782\n",
"['OCC1CCCO1' None]\n",
"259764\n",
"['OCC1CCCO1' None]\n",
"264401\n",
"['OCC1CCCO1' None]\n",
"267955\n",
"['OCC1CCCO1' None]\n",
"268185\n",
"['O' 'OCC1CCCO1']\n",
"268584\n",
"['OCC1CCCO1' None]\n",
"278882\n",
"['OCC1CCCO1' None]\n",
"281471\n",
"['OCC1CCCO1' None]\n",
"282835\n",
"['CCOC(C)=O' 'OCC1CCCO1']\n",
"282886\n",
"['OCC1CCCO1' None]\n",
"282894\n",
"['OCC1CCCO1' None]\n",
"282901\n",
"['OCC1CCCO1' None]\n",
"283847\n",
"['OCC1CCCO1' None]\n",
"287862\n",
"['OCC1CCCO1' None]\n",
"293441\n",
"['OCC1CCCO1' None]\n",
"300382\n",
"['O' 'OCC1CCCO1']\n",
"300397\n",
"['O' 'OCC1CCCO1']\n",
"300411\n",
"['O' 'OCC1CCCO1']\n",
"311473\n",
"['OCC1CCCO1' None]\n",
"312713\n",
"['O' 'OCC1CCCO1']\n",
"312727\n",
"['O' 'OCC1CCCO1']\n",
"338143\n",
"['OCC1CCCO1' None]\n",
"339244\n",
"['OCC1CCCO1' None]\n",
"339862\n",
"['O' 'OCC1CCCO1']\n",
"351248\n",
"['O' 'OCC1CCCO1']\n",
"351249\n",
"['OCC1CCCO1' None]\n",
"353752\n",
"['O' 'OCC1CCCO1']\n",
"354705\n",
"['OCC1CCCO1' None]\n",
"372195\n",
"['O' 'OCC1CCCO1']\n",
"377535\n",
"['O' 'OCC1CCCO1']\n",
"382436\n",
"['CCOC(C)=O' 'OCC1CCCO1']\n",
"401475\n",
"['O' 'OCC1CCCO1']\n",
"403912\n",
"['OCC1CCCO1' None]\n",
"408393\n",
"['O' 'OCC1CCCO1']\n",
"414196\n",
"['CCOC(C)=O' 'OCC1CCCO1']\n",
"415182\n",
"['OCC1CCCO1' None]\n",
"415215\n",
"['OCC1CCCO1' None]\n",
"454790\n",
"['O' 'OCC1CCCO1']\n",
"458851\n",
"['O' 'OCC1CCCO1']\n",
"466098\n",
"['O' 'OCC1CCCO1']\n",
"473298\n",
"['OCC1CCCO1' None]\n",
"473308\n",
"['OCC1CCCO1' None]\n",
"480182\n",
"['O' 'OCC1CCCO1']\n",
"501472\n",
"['OCC1CCCO1' None]\n",
"508052\n",
"['OCC1CCCO1' None]\n",
"510500\n",
"['OCC1CCCO1' None]\n",
"520416\n",
"['OCC1CCCO1' None]\n",
"539132\n",
"['O' 'OCC1CCCO1']\n",
"539217\n",
"['OCC1CCCO1' None]\n",
"542065\n",
"['OCC1CCCO1' None]\n",
"542072\n",
"['OCC1CCCO1' None]\n",
"547433\n",
"['OCC1CCCO1' None]\n",
"547854\n",
"['OCC1CCCO1' None]\n",
"561913\n",
"['O' 'OCC1CCCO1']\n",
"564862\n",
"['O' 'OCC1CCCO1']\n",
"572405\n",
"['O' 'OCC1CCCO1']\n",
"573579\n",
"['O' 'OCC1CCCO1']\n",
"577100\n",
"['OCC1CCCO1' None]\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m solv \u001b[39m=\u001b[39m df[[\u001b[39m'\u001b[39m\u001b[39msolvent_000\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39msolvent_001\u001b[39m\u001b[39m'\u001b[39m]]\n\u001b[1;32m 2\u001b[0m \u001b[39m#iterate over all rows\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfor\u001b[39;00m index, row \u001b[39min\u001b[39;00m solv\u001b[39m.\u001b[39miterrows():\n\u001b[1;32m 4\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39mOCC1CCCO1\u001b[39m\u001b[39m'\u001b[39m \u001b[39min\u001b[39;00m row\u001b[39m.\u001b[39mvalues:\n\u001b[1;32m 5\u001b[0m \u001b[39mprint\u001b[39m(index)\n",
"File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/frame.py:1399\u001b[0m, in \u001b[0;36mDataFrame.iterrows\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1397\u001b[0m using_cow \u001b[39m=\u001b[39m using_copy_on_write()\n\u001b[1;32m 1398\u001b[0m \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvalues):\n\u001b[0;32m-> 1399\u001b[0m s \u001b[39m=\u001b[39m klass(v, index\u001b[39m=\u001b[39;49mcolumns, name\u001b[39m=\u001b[39;49mk)\u001b[39m.\u001b[39;49m__finalize__(\u001b[39mself\u001b[39;49m)\n\u001b[1;32m 1400\u001b[0m \u001b[39mif\u001b[39;00m using_cow \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mgr\u001b[39m.\u001b[39mis_single_block:\n\u001b[1;32m 1401\u001b[0m s\u001b[39m.\u001b[39m_mgr\u001b[39m.\u001b[39madd_references(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mgr) \u001b[39m# type: ignore[arg-type]\u001b[39;00m\n",
"File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/generic.py:5955\u001b[0m, in \u001b[0;36mNDFrame.__finalize__\u001b[0;34m(self, other, method, **kwargs)\u001b[0m\n\u001b[1;32m 5952\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m other\u001b[39m.\u001b[39mattrs:\n\u001b[1;32m 5953\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mattrs[name] \u001b[39m=\u001b[39m other\u001b[39m.\u001b[39mattrs[name]\n\u001b[0;32m-> 5955\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mflags\u001b[39m.\u001b[39mallows_duplicate_labels \u001b[39m=\u001b[39m other\u001b[39m.\u001b[39;49mflags\u001b[39m.\u001b[39;49mallows_duplicate_labels\n\u001b[1;32m 5956\u001b[0m \u001b[39m# For subclasses using _metadata.\u001b[39;00m\n\u001b[1;32m 5957\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m \u001b[39mset\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_metadata) \u001b[39m&\u001b[39m \u001b[39mset\u001b[39m(other\u001b[39m.\u001b[39m_metadata):\n",
"File \u001b[0;32m~/opt/anaconda3/envs/chemistry/lib/python3.10/site-packages/pandas/core/flags.py:53\u001b[0m, in \u001b[0;36mFlags.allows_duplicate_labels\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_allows_duplicate_labels \u001b[39m=\u001b[39m allows_duplicate_labels\n\u001b[1;32m 51\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_obj \u001b[39m=\u001b[39m weakref\u001b[39m.\u001b[39mref(obj)\n\u001b[0;32m---> 53\u001b[0m \u001b[39m@property\u001b[39m\n\u001b[1;32m 54\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mallows_duplicate_labels\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mbool\u001b[39m:\n\u001b[1;32m 55\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[39m Whether this object allows duplicate labels.\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[39m a [0, 1]\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m 83\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_allows_duplicate_labels\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
Expand All @@ -261,12 +47,13 @@
"for index, row in solv.iterrows():\n",
" if 'OCC1CCCO1' in row.values:\n",
" print(index)\n",
" print(row.values)"
" print(row.values)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -677,6 +464,100 @@
"def get_intersection(df1, df2):\n",
" return df1.index.intersection(df2.index)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Find reactions in the test set (rare -> other) that don't appear in the train set"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"test_df_path = \"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/orderly_v5/orderly_no_trust_no_map_test.parquet\"\n",
"train_df_path = \"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/orderly_v5/orderly_no_trust_with_map_train.parquet\"\n",
"test_df = pd.read_parquet(test_df_path)\n",
"train_df = pd.read_parquet(train_df_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30422\n",
"33930\n"
]
}
],
"source": [
"\n",
"# Extract the values from the \"rxn_str\" column in train_df\n",
"train_rxn_str_values = train_df[\"rxn_str\"].values\n",
"\n",
"# Filter test_df based on values present in train_rxn_str_values\n",
"intersection_indices = test_df[test_df[\"rxn_str\"].isin(train_rxn_str_values)].index\n",
"\n",
"# Print the number of overlapping test rows, then the total number of test rows\n",
"print(len(intersection_indices))\n",
"print(len(test_df))\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3508"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df = test_df.drop(intersection_indices)\n",
"len(test_df)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Save the remaining test_df rows (those whose rxn_str is not in train_df) as parquet\n",
"test_df.to_parquet(\"/Users/dsw46/Library/CloudStorage/OneDrive-UniversityofCambridge/Datasets/intersection_test_set.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 8eac53a

Please sign in to comment.