Skip to content

Commit

Permalink
Deterministic random drop duplicates (#162)
Browse files Browse the repository at this point in the history
* consider yield when drop duplicates if consistent yield

* fix path

* fix path

* random deterministic drop duplicates

* log how many rxn have multiple different yields

* make black

* fix mypy

* starts with expects a tuple

* make black

* update make file
  • Loading branch information
dswigh authored Jan 24, 2024
1 parent c7d5763 commit 7189b7f
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 41 deletions.
19 changes: 10 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ paper_clean_uspto_with_trust_unfiltered: #requires: paper_extract_uspto_with_tru

paper_2: paper_clean_uspto_no_trust_unfiltered paper_clean_uspto_with_trust_unfiltered

# 3. Plots
# 3. Plots

paper_plot_uspto_no_trust_unfiltered_num_rxn_components: #requires: paper_clean_uspto_no_trust_unfiltered
python -m orderly.plot --clean_data_path="data/orderly/uspto_no_trust/unfiltered/unfiltered_orderly_ord.parquet" --plot_output_path="data/orderly/plot_no_trust/" --plot_num_rxn_components_bool=True --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=False
Expand Down Expand Up @@ -249,7 +249,7 @@ paper_8: fp_no_trust_no_map_test fp_no_trust_no_map_train fp_no_trust_with_map_t
#Generate datasets for paper
paper_get_datasets: paper_1 paper_6

paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8
paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6

# 9. train models
#Remember to switch env here (must contain TF, e.g. tf_mac_m1)
Expand Down Expand Up @@ -297,22 +297,23 @@ clean_orderly_condition:
pass #TODO

clean_orderly_forward:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_forward_non_uspto:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_forward_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto/extracted_ords" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_retro:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_retro_non_uspto:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_retro_non_uspto.parquet" --ord_extraction_path="data/orderly/non_uspto/extracted_ords" --molecules_to_remove_path="data/orderly/non_uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

clean_orderly_yield:
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False
python -m orderly.clean --output_path="data/orderly/orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False

# ORDerly cond missing, not determined yet
gen_all_benchmarks: extract_uspto extract_non_uspto clean_orderly_forward clean_orderly_forward_non_uspto clean_orderly_retro clean_orderly_retro_non_uspto clean_orderly_yield
# ORDerly-cond is still missing and ORDerly-yield is not yet determined
gen_all_benchmarks: clean_orderly_forward clean_orderly_forward_non_uspto clean_orderly_retro clean_orderly_retro_non_uspto clean_orderly_yield
do_all_cleaning: paper_2 paper_3 paper_4 paper_5 paper_6 gen_all_benchmarks

# Sweeps
RANDOM_SEEDS = 12345 54321 98765
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,16 @@ If you would like to extract all data in ORD (instead of just USPTO data) simply

### ORDerly-forward

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_forward.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=2 --num_reactant=3 --num_solv=3 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```

### ORDerly-retro

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_retro.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```


### ORDerly-yield

```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```
```python -m orderly.clean --output_path="../orderly_benchmarks/orderly_yield.parquet" --ord_extraction_path="data/orderly/uspto/extracted_ords" --molecules_to_remove_path="data/orderly/uspto/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --num_product=1 --num_reactant=2 --num_solv=-1 --num_agent=-1 --num_cat=0 --num_reag=0 --consistent_yield=True --scramble=True --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False```

## Dataset from all non-USPTO data

Expand Down
99 changes: 72 additions & 27 deletions orderly/clean/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,25 +764,59 @@ def _get_dataframe(self) -> pd.DataFrame:
f"After removing reactions with inconsistent yields: {df.shape[0]}"
)

# drop duplicates
if self.drop_duplicates:
col_subset = [
col
for col in df.columns
if col.startswith(
("reactant", "product", "solvent", "reagent", "agent", "catalyst")
)
def get_columns_for_duplicate_checking(
df: pd.DataFrame, consistent_yield: bool
) -> List[str]:
"""Get the columns to check for duplicates"""
if consistent_yield:
columns = [
"reactant",
"product",
"solvent",
"reagent",
"agent",
"catalyst",
"yield",
]
else:
columns = [
"reactant",
"product",
"solvent",
"reagent",
"agent",
"catalyst",
]

columns_to_check = [
col for col in df.columns if col.startswith(tuple(columns))
]
LOG.info(
f"Before removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
return columns_to_check

# Rearrange the row order of the df randomly, but deterministically, so it's a random rxn that gets dropped, not the oldest (since the oldest rxns are at the top of the df)
# Setting a seed for reproducibility
np.random.seed(12345)

# Assign a random number to each row
df["random"] = np.random.rand(len(df))

# Sort by the random number
df.sort_values("random", inplace=True)

# Remove reactions with rare molecules
if self.min_frequency_of_occurrence != 0: # We need to check for rare molecules
# drop duplicates
if self.drop_duplicates:
col_subset = get_columns_for_duplicate_checking(
df, self.consistent_yield
)
LOG.info(
f"Before removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (before map_to_other) ({col_subset=}): {df.shape[0]}"
)
# Define the list of columns to check

columns_to_count_from = self._get_columns_beginning_with_str(
Expand Down Expand Up @@ -810,22 +844,29 @@ def _get_dataframe(self) -> pd.DataFrame:

# drop duplicates deals with any final duplicates from mapping rares to other
if self.drop_duplicates:
col_subset = [
col
for col in df.columns
if col.startswith(
("reactant", "product", "solvent", "reagent", "agent", "catalyst")
)
]
col_subset = col_subset = get_columns_for_duplicate_checking(
df, self.consistent_yield
)
LOG.info(
f"Before removing duplicates (after map_to_other) ({col_subset=}): {df.shape[0]}"
f"Before removing duplicates (after map_to_other, if applicable) ({col_subset=}): {df.shape[0]}"
)
df = df.drop_duplicates(subset=col_subset, keep="first")
LOG.info(
f"After removing duplicates (after map_to_other) ({col_subset=}): {df.shape[0]}"
f"After removing duplicates (after map_to_other, if applicable) ({col_subset=}): {df.shape[0]}"
)
# Track how many reactions have multiple different yields
if self.consistent_yield:
secondary_df = df.copy()
secondary_col_subset = get_columns_for_duplicate_checking(
secondary_df, not self.consistent_yield
)
secondary_df.drop_duplicates(subset=secondary_col_subset, keep="first")
LOG.info(
f"Total number of reactions: {df.shape[0]}. Reactions with multiple yields: {df.shape[0] - secondary_df.shape[0]}"
)

df.reset_index(inplace=True, drop=True)
df.drop("random", axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

if self.scramble:
components = ("reactant", "product", "solvent", "catalyst", "reagent")
Expand Down Expand Up @@ -872,7 +913,7 @@ def get_matching_indices(
# Need to fillna with "NULL" so that the matching works
for col in reactant_columns + product_columns:
df[col] = df[col].fillna("NULL")

# Get reaction 'hashes'
reaction_hashes = [
".".join(
Expand Down Expand Up @@ -1355,7 +1396,11 @@ def main(
df_for_matching = df.copy()

matching_indices = get_matching_indices(
df_for_matching, train_indices, test_indices, reactant_columns, product_columns
df_for_matching,
train_indices,
test_indices,
reactant_columns,
product_columns,
)

# drop the matching rows from the test set
Expand Down
4 changes: 2 additions & 2 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,10 +891,10 @@ def test_get_cleaned_df(

cleaned_df, _ = copy.copy(cleaned_df_params_default)
assert not cleaned_df.empty

# check that "NULL" is not in the dataframe
assert not "NULL" in cleaned_df.values

# TODO: check that there's only NaN or NaT, but no None


Expand Down

0 comments on commit 7189b7f

Please sign in to comment.