Skip to content

Commit

Permalink
Pickle to parquet (#41)
Browse files Browse the repository at this point in the history
* regen test data

* renaming and regen test data

* adding some imports

* adding load and save list func

* added test and regen test data
  • Loading branch information
Joearrowsmith authored Apr 14, 2023
1 parent 8f5cc20 commit cc69036
Show file tree
Hide file tree
Showing 94 changed files with 5,955 additions and 113 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ FROM orderly_base as orderly_mypy_strict

CMD ["make", "strict_mypy"]

FROM orderly_base as orderly_gen_test_data

CMD ["make", "gen_test_data"]

FROM ubuntu:20.04 as orderly_download
RUN apt-get update && apt-get install -y make curl unzip

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ run_orderly_mypy_strict:
docker image build --target orderly_mypy_strict --tag orderly_mypy_strict .
docker run orderly_mypy_strict

run_orderly_gen_test_data:
docker image build --target orderly_gen_test_data --tag orderly_gen_test_data .
docker run -v $(current_dir)/orderly/data/:/home/worker/repo/orderly/data/ -u $(uid):$(gid) orderly_gen_test_data

linux_download_ord:
docker image build --target orderly_download_linux --tag orderly_download_linux .
docker run -v $(current_dir)/data:/tmp_data -u $(uid):$(gid) orderly_download_linux
Expand Down
16 changes: 8 additions & 8 deletions inspections.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4891,7 +4891,7 @@
"metadata": {},
"outputs": [],
"source": [
"def merge_pickles_mol_names():\n",
"def merge_extracted_ords_mol_names():\n",
" #create one big list of all the pickled names\n",
" folder_path = '/Users/dsw46/Projects/chemical-parameter-sharing/data/USPTO/molecule_names/'\n",
" onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
Expand Down Expand Up @@ -4919,7 +4919,7 @@
}
],
"source": [
"names_list = merge_pickles_mol_names()"
"names_list = merge_extracted_ords_mol_names()"
]
},
{
Expand Down Expand Up @@ -54537,9 +54537,9 @@
"outputs": [],
"source": [
"# Inspect pickled data\n",
"def merge_pickles():\n",
"def merge_extracted_ords():\n",
" #create one big df of all the pickled data\n",
" folder_path = 'data/USPTO/pickled_data/'\n",
" folder_path = 'data/USPTO/extracted_ords/'\n",
" onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
" full_df = pd.DataFrame()\n",
" for file in tqdm(onlyfiles[:100]):\n",
Expand All @@ -54565,7 +54565,7 @@
}
],
"source": [
"df = merge_pickles()"
"df = merge_extracted_ords()"
]
},
{
Expand Down Expand Up @@ -55268,9 +55268,9 @@
"metadata": {},
"outputs": [],
"source": [
"def merge_pickles():\n",
"def merge_extracted_ords():\n",
" #create one big df of all the pickled data\n",
" folder_path = 'data/USPTO/pickled_data/'\n",
" folder_path = 'data/USPTO/extracted_ords/'\n",
" onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
" full_df = pd.DataFrame()\n",
" for file in tqdm(onlyfiles):\n",
Expand All @@ -55296,7 +55296,7 @@
}
],
"source": [
"df = merge_pickles()"
"df = merge_extracted_ords()"
]
},
{
Expand Down
48 changes: 24 additions & 24 deletions orderly/clean/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

LOG = logging.getLogger(__name__)

import orderly.data.util


@dataclasses.dataclass(kw_only=True)
class Cleaner:
Expand Down Expand Up @@ -45,7 +47,7 @@ class Cleaner:
disable_tqdm (bool, optional): Controls the use of tqdm progress bar. Defaults to False.
"""

pickles_path: pathlib.Path
ord_extraction_path: pathlib.Path
consistent_yield: bool
num_reactant: int
num_product: int
Expand All @@ -64,24 +66,24 @@ class Cleaner:
def __post_init__(self) -> None:
self.cleaned_reactions = self._get_dataframe()

def _merge_pickles(self) -> pd.DataFrame:
def _merge_extracted_ords(self) -> pd.DataFrame:
# create one big df of all the extracted data

LOG.info("Getting merged dataframe from pickle files")
LOG.info("Getting merged dataframe from extracted ord files")

onlyfiles = [
f
for f in os.listdir(self.pickles_path)
if os.path.isfile(os.path.join(self.pickles_path, f))
for f in os.listdir(self.ord_extraction_path)
if os.path.isfile(os.path.join(self.ord_extraction_path, f))
]

dfs = []
with tqdm.contrib.logging.logging_redirect_tqdm(loggers=[LOG]):
for file in tqdm.tqdm(onlyfiles, disable=self.disable_tqdm):
if file[0] != ".": # We don't want to try to unpickle .DS_Store
filepath = self.pickles_path / file
unpickled_df = pd.read_pickle(filepath)
dfs.append(unpickled_df)
if file[0] != ".": # Skip hidden dot-files (e.g. .DS_Store); they are not extracted data
filepath = self.ord_extraction_path / file
extracted_df = pd.read_parquet(filepath)
dfs.append(extracted_df)
return pd.concat(dfs, ignore_index=True)

def _get_number_of_columns_to_keep(self) -> Dict[str, int]:
Expand Down Expand Up @@ -198,7 +200,7 @@ def _get_dataframe(self) -> pd.DataFrame:

LOG.info("Getting dataframe")

df = self._merge_pickles()
df = self._merge_extracted_ords()
LOG.info(f"All data length: {len(df)}")

# Remove reactions with too many of a certain component
Expand Down Expand Up @@ -314,16 +316,16 @@ def _get_dataframe(self) -> pd.DataFrame:
help="The filepath where the cleaned data will be saved",
)
@click.option(
"--pickles_path",
default="data/orderly/pickled_data",
"--ord_extraction_path",
default="data/orderly/extracted_ords",
type=str,
help="The filepath to the folder than contains the extracted pickles",
help="The filepath to the folder that contains the extracted ord data",
)
@click.option(
"--molecules_to_remove_path",
default="data/orderly/all_molecule_names.pkl",
default="data/orderly/all_molecule_names.csv",
type=str,
help="The path to the pickle file than contains the molecules_names",
help="The path to the file that contains the molecules_names",
)
@click.option(
"--consistent_yield",
Expand Down Expand Up @@ -405,7 +407,7 @@ def _get_dataframe(self) -> pd.DataFrame:
@click.option("--disable_tqdm", type=bool, default=False, show_default=True)
def main_click(
clean_data_path: pathlib.Path,
pickles_path: pathlib.Path,
ord_extraction_path: pathlib.Path,
molecules_to_remove_path: pathlib.Path,
consistent_yield: bool,
num_reactant: int,
Expand Down Expand Up @@ -445,7 +447,7 @@ def main_click(
"""
main(
clean_data_path=pathlib.Path(clean_data_path),
pickles_path=pathlib.Path(pickles_path),
ord_extraction_path=pathlib.Path(ord_extraction_path),
molecules_to_remove_path=pathlib.Path(molecules_to_remove_path),
consistent_yield=consistent_yield,
num_reactant=num_reactant,
Expand All @@ -465,7 +467,7 @@ def main_click(

def main(
clean_data_path: pathlib.Path,
pickles_path: pathlib.Path,
ord_extraction_path: pathlib.Path,
molecules_to_remove_path: pathlib.Path,
consistent_yield: bool,
num_reactant: int,
Expand Down Expand Up @@ -506,23 +508,21 @@ def main(

if not isinstance(clean_data_path, pathlib.Path):
raise ValueError(f"Expect pathlib.Path: got {type(clean_data_path)}")
if not isinstance(pickles_path, pathlib.Path):
raise ValueError(f"Expect pathlib.Path: got {type(pickles_path)}")
if not isinstance(ord_extraction_path, pathlib.Path):
raise ValueError(f"Expect pathlib.Path: got {type(ord_extraction_path)}")
if not isinstance(molecules_to_remove_path, pathlib.Path):
raise ValueError(f"Expect pathlib.Path: got {type(molecules_to_remove_path)}")

start_time = datetime.datetime.now()

molecules_to_remove = pd.read_pickle(
molecules_to_remove_path
) # reads in list of strs
molecules_to_remove = orderly.data.util.load_list(molecules_to_remove_path)

assert num_agent == 0 or (
num_cat == 0 and num_reag == 0
), "Invalid input: If trust_labelling=True in orderly.extract, then num_cat and num_reag must be 0. If trust_labelling=False, then num_agent must be 0."

instance = Cleaner(
pickles_path=pickles_path,
ord_extraction_path=ord_extraction_path,
consistent_yield=consistent_yield,
num_reactant=num_reactant,
num_product=num_product,
Expand Down
Loading

0 comments on commit cc69036

Please sign in to comment.