Commit

Get training working (#128)
* Verbosity flag

* Log accuracy of frequency informed guess

* Fix mypy

* Add type

* Update gitignore

* Add reduce lr on plateau (see the callback sketch after this list)

* Fix mypy

* Make sure best model is loaded for evaluation (see the callback sketch after this list)

* Update batch size for experiments

* Add commands for sweeps

* Update commands

* Update commands filepath

* Add interleave cli command

* Update CPU profiling

* Changes to get model working

* Add jsonify dict

* Add val metrics

* Mypy and linting

* Linting

* Setup sweep

* Fix sweep

* Update evaluation code

* Run training

* Fix fingerprint indices

* Add assert statement to make sure df and fp are same size

* Make sure train_fraction is taken on the training set only

* Change the frequency informed guess to be based on the training set instead of the training and validation set (see the baseline sketch after this list)

* Change seed

* Linting
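For reference, the "reduce lr on plateau" and "best model is loaded for evaluation" bullets above map onto standard Keras callbacks. A minimal sketch of that combination, with toy data standing in for the fingerprint datasets (the shapes, layer sizes, and hyperparameters here are illustrative assumptions, not taken from this commit):

import numpy as np
import tensorflow as tf

# Toy stand-ins for the real fingerprint datasets
x = np.random.rand(256, 16).astype("float32")
y = np.random.randint(0, 2, size=(256,))
train_ds = tf.data.Dataset.from_tensor_slices((x[:192], y[:192])).batch(32)
val_ds = tf.data.Dataset.from_tensor_slices((x[192:], y[192:])).batch(32)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

callbacks = [
    # Halve the learning rate after 3 epochs without val_loss improvement
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3),
    # Keep the best weights seen so far on disk
    tf.keras.callbacks.ModelCheckpoint("best.weights.h5", monitor="val_loss",
                                       save_best_only=True, save_weights_only=True),
]
model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=callbacks, verbose=0)

# Reload the best checkpoint before evaluating, so final metrics reflect
# the best epoch rather than the last one.
model.load_weights("best.weights.h5")
model.evaluate(val_ds, verbose=0)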
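The "frequency informed guess" bullets describe a majority-class baseline; after this commit its statistics come from the training set only. A minimal sketch of such a baseline, using made-up solvent labels purely for illustration:

import pandas as pd

# Toy training/validation labels (values invented for illustration)
train = pd.Series(["THF", "THF", "DMF", "MeOH", "THF", "DMF"])
val = pd.Series(["THF", "DMF", "THF", "EtOH"])

# The guess is informed by label frequencies in the training set only
most_frequent = train.value_counts().idxmax()  # "THF"

# Accuracy of always predicting the most frequent training label
print((val == most_frequent).mean())  # 0.5: 2 of 4 validation labels are THF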
marcosfelt authored Jun 5, 2023
1 parent a22d8e5 commit 99dbf8e
Showing 14 changed files with 579 additions and 133 deletions.
31 changes: 23 additions & 8 deletions Makefile
@@ -312,19 +312,34 @@ with_trust_no_map_train_20:
with_trust_with_map_train_20:
python -m condition_prediction --train_data_path="data/orderly/datasets/orderly_with_trust_with_map_train.parquet" --test_data_path="data/orderly/datasets/orderly_with_trust_with_map_test.parquet" --output_folder_path="models/with_trust_with_map_20" --train_fraction=0.2 --train_val_split=0.8 --overwrite=False --epochs=20 --evaluate_on_test_data=True --early_stopping_patience=5 --wandb_entity=WANDB_ENTITY


# Sweeps
sweep_no_trust_no_map_train:
python -m sweep sweeps/no_trust_no_map_train.yaml --max_parallel 1
TRAIN_FRACS = 0.2 0.4 0.6 0.8 1.0
DATASETS_PATH = /project/studios/orderly-preprocessing/ORDerly/data/orderly/datasets/
DATASETS = no_trust_no_map no_trust_with_map with_trust_no_map with_trust_with_map
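# Note: $${var} escapes the dollar sign so the shell, not Make, expands the loop variables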
dataset_size_sweep:
@for dataset in ${DATASETS}; \
do \
for train_frac in ${TRAIN_FRACS}; \
do \
rm -rf .tf_cache* && python -m condition_prediction --train_data_path=${DATASETS_PATH}/orderly_$${dataset}_train.parquet --test_data_path=${DATASETS_PATH}/orderly_$${dataset}_test.parquet --output_folder_path=models/$${dataset} --train_fraction=$${train_frac} --train_val_split=0.8 --overwrite=True --batch_size=512 --epochs=100 --early_stopping_patience=0 --evaluate_on_test_data=True --wandb_entity="ceb-sre"; \
done \
done


sweep_no_trust_no_map_train_commands:
python -m sweep sweeps/no_trust_no_map_train.yaml --dry_run

sweep_no_trust_with_map_train:
python -m sweep sweeps/no_trust_with_map_train.yaml --max_parallel 1
sweep_no_trust_with_map_train_commands:
python -m sweep sweeps/no_trust_with_map_train.yaml --dry_run

sweep_with_trust_no_map_train:
python -m sweep sweeps/with_trust_no_map_train.yaml --max_parallel 1
sweep_with_trust_no_map_train_commands:
python -m sweep sweeps/with_trust_no_map_train.yaml --dry_run

sweep_with_trust_with_map_train:
python -m sweep sweeps/with_trust_with_map_train.yaml --max_parallel 1
sweep_with_trust_with_map_train_commands:
python -m sweep sweeps/with_trust_with_map_train.yaml --dry_run

sweep_all: sweep_no_trust_no_map_train_commands sweep_no_trust_with_map_train_commands sweep_with_trust_no_map_train_commands sweep_with_trust_with_map_train_commands

train_all: no_trust_no_map_train no_trust_with_map_train with_trust_no_map_train with_trust_with_map_train no_trust_no_map_train_20 no_trust_with_map_train_20 with_trust_no_map_train_20 with_trust_with_map_train_20

49 changes: 38 additions & 11 deletions condition_prediction/condition_prediction/data_generator.py
@@ -39,6 +39,10 @@ class GenerateData:
mol4: NDArray[np.float32]
mol5: NDArray[np.float32]

# def __post_init__(self):
# initializer = lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)
# self.pool = multiprocessing.Pool(os.cpu_count(), initializer)

def map_idx_to_data(self, idx):
idx = idx.numpy()
if self.product_fp is None and self.rxn_diff_fp is None:
@@ -192,6 +196,10 @@ def get_dataset(
# Construct outputs
if fp is None and df is None:
raise ValueError("Must provide either df or fp")
elif fp is not None and df is not None and fp.shape[0] != df.shape[0]:
raise ValueError(
f"Fingerprint ({fp.shape}) and dataframe ({df.shape}) not the same size"
)

if fp is not None:
product_fp = fp[:, : fp.shape[1] // 2]
@@ -244,15 +252,6 @@ def map_func(idx):
# num_parallel_calls=os.cpu_count(), deterministic=False
)

if cache_data:
cache_dir = Path(cache_dir)
if not cache_dir.exists():
cache_dir.mkdir(exist_ok=True)
# # Read through dataset once to cache it
# print("Caching dataset")
# [1 for _ in dataset.as_numpy_iterator()]
dataset = dataset.cache(filename=str(cache_dir / "fps"))

# ensures shape is correct after batching
# See https://github.com/tensorflow/tensorflow/issues/32912#issuecomment-550363802
def _fixup_shape(X, Y):
@@ -264,6 +263,26 @@ def _fixup_shape(X, Y):

dataset = dataset.map(_fixup_shape)

if cache_data:
cache_dir = Path(cache_dir)
if not cache_dir.exists():
cache_dir.mkdir(exist_ok=True)
# Read through dataset once to cache it
print("Caching dataset")
[1 for _ in dataset.as_numpy_iterator()]
# dataset = dataset.cache(filename=str(cache_dir / "fps"))
dataset = dataset.cache()

if cache_data:
cache_dir = Path(cache_dir)
if not cache_dir.exists():
cache_dir.mkdir(exist_ok=True)
# Read through dataset once to cache it
print("Caching dataset")
[1 for _ in dataset.as_numpy_iterator()]
dataset = dataset.cache(filename=str(cache_dir / "fps"))
# dataset = dataset.cache()

if interleave:
dataset = tf.data.Dataset.range(len(dataset)).interleave(
lambda _: dataset,
@@ -275,6 +294,7 @@ def _fixup_shape(X, Y):
if prefetch_buffer_size is None:
prefetch_buffer_size = AUTOTUNE
dataset = dataset.prefetch(buffer_size=prefetch_buffer_size)
print("Prefetch buffer size:", prefetch_buffer_size)
return dataset
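
The cache → interleave → prefetch ordering above follows the usual tf.data pattern. A self-contained sketch of the same pipeline shape, with random tensors standing in for the fingerprint batches (shapes are arbitrary assumptions):

import tensorflow as tf

AUTOTUNE = tf.data.AUTOTUNE

dataset = tf.data.Dataset.from_tensor_slices(tf.random.uniform((64, 8))).batch(16)
dataset = dataset.cache()  # in-memory; pass filename=... for an on-disk cache

# Interleaving a dataset with itself, as above, overlaps the production of
# consecutive elements across parallel calls.
dataset = tf.data.Dataset.range(len(dataset)).interleave(
    lambda _: dataset,
    num_parallel_calls=AUTOTUNE,
    deterministic=False,
)

# Overlap preparation of the next batch with consumption of the current one
dataset = dataset.prefetch(buffer_size=AUTOTUNE)

for batch in dataset.take(1):
    print(batch.shape)  # (16, 8)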


@@ -294,6 +314,7 @@ def get_datasets(
cache_train_data: bool = False,
cache_val_data: bool = False,
cache_test_data: bool = False,
interleave: bool = False,
):
"""
Get data generators for train, val and test
@@ -390,13 +411,15 @@ def get_datasets(
train_mol4,
train_mol5,
df=df.iloc[train_idx],
fp=train_val_fp,
fp=train_val_fp[train_idx] if train_val_fp is not None else None,
mode=train_mode,
fp_size=fp_size,
shuffle=True,
batch_size=batch_size,
shuffle_buffer_size=shuffle_buffer_size,
cache_data=cache_train_data,
prefetch_buffer_size=prefetch_buffer_size,
interleave=interleave,
cache_dir=".tf_cache_train/",
)
val_dataset = get_dataset(
@@ -406,12 +429,14 @@ def get_datasets(
val_mol4,
val_mol5,
df=df.iloc[val_idx],
fp=train_val_fp,
fp=train_val_fp[val_idx] if train_val_fp is not None else None,
mode=val_mode,
fp_size=fp_size,
shuffle=False,
batch_size=batch_size,
shuffle_buffer_size=shuffle_buffer_size,
prefetch_buffer_size=prefetch_buffer_size,
interleave=interleave,
cache_data=cache_val_data,
cache_dir=".tf_cache_val/",
)
@@ -428,6 +453,8 @@ def get_datasets(
shuffle=False,
batch_size=batch_size,
shuffle_buffer_size=shuffle_buffer_size,
prefetch_buffer_size=prefetch_buffer_size,
interleave=interleave,
cache_data=cache_test_data,
cache_dir=".tf_cache_test/",
)
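
The indexing changes above (the size check in get_dataset and the switch to fp=train_val_fp[train_idx] / fp=train_val_fp[val_idx]) all serve to keep the fingerprint matrix row-aligned with the dataframe. A minimal sketch of the same bookkeeping with random data, assuming the concatenated fingerprint layout implied by the slicing in get_dataset (first half product fingerprint, second half reaction-difference fingerprint):

import numpy as np

rng = np.random.default_rng(0)
n_rows, fp_size = 100, 32
fp = rng.random((n_rows, 2 * fp_size), dtype=np.float32)

# Size check mirroring the ValueError added in get_dataset
assert fp.shape[0] == n_rows, "Fingerprint and dataframe must be the same size"

# Split the concatenated fingerprint into its two halves
product_fp = fp[:, : fp.shape[1] // 2]
rxn_diff_fp = fp[:, fp.shape[1] // 2 :]

# Index rows with the same train/val indices used for the dataframe, so that
# df.iloc[train_idx] and fp[train_idx] stay aligned.
idx = rng.permutation(n_rows)
train_idx, val_idx = idx[:80], idx[80:]

# train_fraction is applied to the training indices only, never to validation
train_fraction = 0.2
train_idx = train_idx[: int(train_fraction * len(train_idx))]

print(fp[train_idx].shape, fp[val_idx].shape)  # (16, 64) (20, 64)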
(diff for the remaining 12 changed files not shown)
