Skip to content

Commit

Permalink
Deterministic random drop duplicates (#162)
Browse files Browse the repository at this point in the history
* consider yield when drop duplicates if consistent yield

* fix path

* fix path

* random deterministic drop duplicates

* log how many rxn have multiple different yields

* make black

* fix mypy

* starts with expects a tuple

* make black

* update make file
  • Loading branch information
dswigh authored Jan 24, 2024
1 parent c7d5763 commit 7189b7f
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 41 deletions.
19 changes: 10 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ paper_clean_uspto_with_trust_unfiltered: #requires: paper_extract_uspto_with_tru

paper_2: paper_clean_uspto_no_trust_unfiltered paper_clean_uspto_with_trust_unfiltered

# 3. Plots
# 3. Plots

paper_plot_uspto_no_trust_unfiltered_num_rxn_components: #requires: paper_clean_uspto_no_trust_unfiltered
python -m orderly.plot --clean_data_path="data/orderly/uspto_no_trust/unfiltered/unfiltered_orderly_ord.parquet" --plot_output_path="data/orderly/plot_no_trust/" --plot_num_rxn_components_bool=True --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=False
Expand Down Expand Up @@ -249,7 +249,7 @@ paper_8: fp_no_trust_no_map_test fp_no_trust_no_map_train fp_no_trust_with_map_t
#Generate datasets for paper
paper_get_datasets: paper_1 paper_6

paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8
paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6

# 9. train models
#Remember to switch env here (must contain TF, e.g. tf_mac_m1)
Expand Down Expand Up @@ -297,22 +297,23 @@ clean_orderly_condition:
pass #TODO

clean_orderly_forward:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_forward_non_uspto:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto/extracted_ords" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_retro:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_retro_non_uspto:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto/extracted_ords" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_yield:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

# ORDerly cond missing, not determined yet
gen_all_benchmarks: extract_uspto extract_non_uspto clean_orderly_forward clean_orderly_forward_non_uspto clean_orderly_retro clean_orderly_retro_non_uspto clean_orderly_yield
# ORDerly-cond is still missing and ORDerly-yield is not yet determined
gen_all_benchmarks: clean_orderly_forward clean_orderly_forward_non_uspto clean_orderly_retro clean_orderly_retro_non_uspto clean_orderly_yield
do_all_cleaning: paper_2 paper_3 paper_4 paper_5 paper_6 gen_all_benchmarks

# Sweeps
RANDOM_SEEDS = 12345 54321 98765
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,16 @@ If you would like to extract all data in ORD (instead of just USPTO data) simply

### ORDerly-forward

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```

### ORDerly-retro

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```


### ORDerly-yield

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```

## Dataset from all non-USPTO data

Expand Down
99 changes: 72 additions & 27 deletions orderly/clean/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,25 +764,59 @@ def _get_dataframe(self) -> pd.DataFrame:
f"After removing reactions with inconsistent yields: {df.shape[0]}"
)

# drop duplicates
if self.drop_duplicates:
col_subset = [
col
for col in df.columns
if col.startswith(
("reactant", "product", "solvent", "reagent", "agent", "catalyst")
)
def get_columns_for_duplicate_checking(
df: pd.DataFrame, consistent_yield: bool
) -> List[str]:
"""Get the columns to check for duplicates"""
if consistent_yield:
columns = [
"reactant",
"product",
"solvent",
"reagent",
"agent",
"catalyst",
"yield",
]
else:
columns = [
"reactant",
"product",
"solvent",
"reagent",
"agent",
"catalyst",
]

columns_to_check = [
col for col in df.columns if col.startswith(tuple(columns))
]
LOG.info(
f"Before removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
return columns_to_check

# Rearrange the row order of the df randomly, but deterministically, so it's a random rxn that gets dropped, not the oldest (since the oldest rxns are at the top of the df)
# Setting a seed for reproducibility
np.random.seed(12345)

# Assign a random number to each row
df["random"] = np.random.rand(len(df))

# Sort by the random number
df.sort_values("random", inplace=True)

# Remove reactions with rare molecules
if self.min_frequency_of_occurrence != 0: # We need to check for rare molecules
# drop duplicates
if self.drop_duplicates:
col_subset = get_columns_for_duplicate_checking(
df, self.consistent_yield
)
LOG.info(
f"Before removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
# Define the list of columns to check

columns_to_count_from = self._get_columns_beginning_with_str(
Expand Down Expand Up @@ -810,22 +844,29 @@ def _get_dataframe(self) -> pd.DataFrame:

# drop duplicates deals with any final duplicates from mapping rares to other
if self.drop_duplicates:
col_subset = [
col
for col in df.columns
if col.startswith(
("reactant", "product", "solvent", "reagent", "agent", "catalyst")
)
]
col_subset = col_subset = get_columns_for_duplicate_checking(
df, self.consistent_yield
)
LOG.info(
f"Before removing duplicates (after map_to_other) ({col_subset=}): {df.shape[0]}"
f"Before removing duplicates (after map_to_other, if applicable) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (after map_to_other) ({col_subset=}): {df.shape[0]}"
f"After removing duplicates (after map_to_other, if applicable) ({col_subset=}): {df.shape[0]}"
)
# Track how many reactions have multiple different yields
if self.consistent_yield:
secondary_df = df.copy()
secondary_col_subset = get_columns_for_duplicate_checking(
secondary_df, not self.consistent_yield
)
secondary_df.drop_duplicates(subset=secondary_col_subset, keep="first")
LOG.info(
f"Total number of reactions: {df.shape[0]}. Reactions with multiple yields: {df.shape[0] - secondary_df.shape[0]}"
)

df.reset_index(inplace=True, drop=True)
df.drop("random", axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

if self.scramble:
components = ("reactant", "product", "solvent", "catalyst", "reagent")
Expand Down Expand Up @@ -872,7 +913,7 @@ def get_matching_indices(
# Need to fillna with "NULL" so that the matching works
for col in reactant_columns + product_columns:
df[col] = df[col].fillna("NULL")

# Get reaction 'hashes'
reaction_hashes = [
".".join(
Expand Down Expand Up @@ -1355,7 +1396,11 @@ def main(
df_for_matching = df.copy()

matching_indices = get_matching_indices(
df_for_matching, train_indices, test_indices, reactant_columns, product_columns
df_for_matching,
train_indices,
test_indices,
reactant_columns,
product_columns,
)

# drop the matching rows from the test set
Expand Down
4 changes: 2 additions & 2 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,10 +891,10 @@ def test_get_cleaned_df(

cleaned_df, _ = copy.copy(cleaned_df_params_default)
assert not cleaned_df.empty

# check that "NULL" is not in the dataframe
assert not "NULL" in cleaned_df.values

# TODO: check that there's only NaN or NaT, but no None


Expand Down

0 comments on commit 7189b7f

Please sign in to comment.