Pickle to parquet (#41)

* regen test data
* renaming and regen test data
* adding some imports
* adding load and save list func
* added test and regen test data
sustainable-processes · Apr 14, 2023 · cc69036 · cc69036
1 parent 8f5cc20
commit cc69036
Show file tree

Hide file tree

Showing 94 changed files with 5,955 additions and 113 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -75,6 +75,10 @@ FROM orderly_base as orderly_mypy_strict
 
 CMD ["make", "strict_mypy"]
 
+FROM orderly_base as orderly_gen_test_data
+
+CMD ["make", "gen_test_data"]
+
 FROM ubuntu:20.04 as orderly_download
 RUN apt-get update && apt-get install -y make curl unzip
 

diff --git a/Makefile b/Makefile
@@ -51,6 +51,10 @@ run_orderly_mypy_strict:
 	docker image build --target orderly_mypy_strict --tag orderly_mypy_strict .
 	docker run orderly_mypy_strict
 
+run_orderly_gen_test_data:
+	docker image build --target orderly_gen_test_data --tag orderly_gen_test_data .
+	docker run -v $(current_dir)/orderly/data/:/home/worker/repo/orderly/data/ -u $(uid):$(gid) orderly_gen_test_data
+
 linux_download_ord:
 	docker image build --target orderly_download_linux --tag orderly_download_linux .
 	docker run -v $(current_dir)/data:/tmp_data -u $(uid):$(gid) orderly_download_linux

diff --git a/inspections.ipynb b/inspections.ipynb
@@ -4891,7 +4891,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def merge_pickles_mol_names():\n",
+    "def merge_extracted_ords_mol_names():\n",
     "    #create one big list of all the pickled names\n",
     "    folder_path = '/Users/dsw46/Projects/chemical-parameter-sharing/data/USPTO/molecule_names/'\n",
     "    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
@@ -4919,7 +4919,7 @@
     }
    ],
    "source": [
-    "names_list = merge_pickles_mol_names()"
+    "names_list = merge_extracted_ords_mol_names()"
    ]
   },
   {
@@ -54537,9 +54537,9 @@
    "outputs": [],
    "source": [
     "# Inspect pickled data\n",
-    "def merge_pickles():\n",
+    "def merge_extracted_ords():\n",
     "    #create one big df of all the pickled data\n",
-    "    folder_path = 'data/USPTO/pickled_data/'\n",
+    "    folder_path = 'data/USPTO/extracted_ords/'\n",
     "    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
     "    full_df = pd.DataFrame()\n",
     "    for file in tqdm(onlyfiles[:100]):\n",
@@ -54565,7 +54565,7 @@
     }
    ],
    "source": [
-    "df = merge_pickles()"
+    "df = merge_extracted_ords()"
    ]
   },
   {
@@ -55268,9 +55268,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def merge_pickles():\n",
+    "def merge_extracted_ords():\n",
     "    #create one big df of all the pickled data\n",
-    "    folder_path = 'data/USPTO/pickled_data/'\n",
+    "    folder_path = 'data/USPTO/extracted_ords/'\n",
     "    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]\n",
     "    full_df = pd.DataFrame()\n",
     "    for file in tqdm(onlyfiles):\n",
@@ -55296,7 +55296,7 @@
     }
    ],
    "source": [
-    "df = merge_pickles()"
+    "df = merge_extracted_ords()"
    ]
   },
   {

diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py
@@ -12,6 +12,8 @@
 
 LOG = logging.getLogger(__name__)
 
+import orderly.data.util
+
 
 @dataclasses.dataclass(kw_only=True)
 class Cleaner:
@@ -45,7 +47,7 @@ class Cleaner:
         disable_tqdm (bool, optional): Controls the use of tqdm progress bar. Defaults to False.
     """
 
-    pickles_path: pathlib.Path
+    ord_extraction_path: pathlib.Path
     consistent_yield: bool
     num_reactant: int
     num_product: int
@@ -64,24 +66,24 @@ class Cleaner:
     def __post_init__(self) -> None:
         self.cleaned_reactions = self._get_dataframe()
 
-    def _merge_pickles(self) -> pd.DataFrame:
+    def _merge_extracted_ords(self) -> pd.DataFrame:
         # create one big df of all the extracted data
 
-        LOG.info("Getting merged dataframe from pickle files")
+        LOG.info("Getting merged dataframe from extracted ord files")
 
         onlyfiles = [
             f
-            for f in os.listdir(self.pickles_path)
-            if os.path.isfile(os.path.join(self.pickles_path, f))
+            for f in os.listdir(self.ord_extraction_path)
+            if os.path.isfile(os.path.join(self.ord_extraction_path, f))
         ]
 
         dfs = []
         with tqdm.contrib.logging.logging_redirect_tqdm(loggers=[LOG]):
             for file in tqdm.tqdm(onlyfiles, disable=self.disable_tqdm):
-                if file[0] != ".":  # We don't want to try to unpickle .DS_Store
-                    filepath = self.pickles_path / file
-                    unpickled_df = pd.read_pickle(filepath)
-                    dfs.append(unpickled_df)
+                if file[0] != ".":  # We don't want to try to read the .DS_Store
+                    filepath = self.ord_extraction_path / file
+                    extracted_df = pd.read_parquet(filepath)
+                    dfs.append(extracted_df)
         return pd.concat(dfs, ignore_index=True)
 
     def _get_number_of_columns_to_keep(self) -> Dict[str, int]:
@@ -198,7 +200,7 @@ def _get_dataframe(self) -> pd.DataFrame:
 
         LOG.info("Getting dataframe")
 
-        df = self._merge_pickles()
+        df = self._merge_extracted_ords()
         LOG.info(f"All data length: {len(df)}")
 
         # Remove reactions with too many of a certain component
@@ -314,16 +316,16 @@ def _get_dataframe(self) -> pd.DataFrame:
     help="The filepath where the cleaned data will be saved",
 )
 @click.option(
-    "--pickles_path",
-    default="data/orderly/pickled_data",
+    "--ord_extraction_path",
+    default="data/orderly/extracted_ords",
     type=str,
-    help="The filepath to the folder than contains the extracted pickles",
+    help="The filepath to the folder than contains the extracted ord data",
 )
 @click.option(
     "--molecules_to_remove_path",
-    default="data/orderly/all_molecule_names.pkl",
+    default="data/orderly/all_molecule_names.csv",
     type=str,
-    help="The path to the pickle file than contains the molecules_names",
+    help="The path to the file than contains the molecules_names",
 )
 @click.option(
     "--consistent_yield",
@@ -405,7 +407,7 @@ def _get_dataframe(self) -> pd.DataFrame:
 @click.option("--disable_tqdm", type=bool, default=False, show_default=True)
 def main_click(
     clean_data_path: pathlib.Path,
-    pickles_path: pathlib.Path,
+    ord_extraction_path: pathlib.Path,
     molecules_to_remove_path: pathlib.Path,
     consistent_yield: bool,
     num_reactant: int,
@@ -445,7 +447,7 @@ def main_click(
     """
     main(
         clean_data_path=pathlib.Path(clean_data_path),
-        pickles_path=pathlib.Path(pickles_path),
+        ord_extraction_path=pathlib.Path(ord_extraction_path),
         molecules_to_remove_path=pathlib.Path(molecules_to_remove_path),
         consistent_yield=consistent_yield,
         num_reactant=num_reactant,
@@ -465,7 +467,7 @@ def main_click(
 
 def main(
     clean_data_path: pathlib.Path,
-    pickles_path: pathlib.Path,
+    ord_extraction_path: pathlib.Path,
     molecules_to_remove_path: pathlib.Path,
     consistent_yield: bool,
     num_reactant: int,
@@ -506,23 +508,21 @@ def main(
 
     if not isinstance(clean_data_path, pathlib.Path):
         raise ValueError(f"Expect pathlib.Path: got {type(clean_data_path)}")
-    if not isinstance(pickles_path, pathlib.Path):
-        raise ValueError(f"Expect pathlib.Path: got {type(pickles_path)}")
+    if not isinstance(ord_extraction_path, pathlib.Path):
+        raise ValueError(f"Expect pathlib.Path: got {type(ord_extraction_path)}")
     if not isinstance(molecules_to_remove_path, pathlib.Path):
         raise ValueError(f"Expect pathlib.Path: got {type(molecules_to_remove_path)}")
 
     start_time = datetime.datetime.now()
 
-    molecules_to_remove = pd.read_pickle(
-        molecules_to_remove_path
-    )  # reads in list of strs
+    molecules_to_remove = orderly.data.util.load_list(molecules_to_remove_path)
 
     assert num_agent == 0 or (
         num_cat == 0 and num_reag == 0
     ), "Invalid input: If trust_labelling=True in orderly.extract, then num_cat and num_reag must be 0. If trust_labelling=False, then num_agent must be 0."
 
     instance = Cleaner(
-        pickles_path=pickles_path,
+        ord_extraction_path=ord_extraction_path,
         consistent_yield=consistent_yield,
         num_reactant=num_reactant,
         num_product=num_product,