Improve CI performance.

1. Improve generation of HDF5 files; it was taking too long. 2. Do not finalize in pytest; this allows multiple tests to run together.
argonne-lcf · Aug 30, 2024 · c2af0c2 · c2af0c2
1 parent d6924f3
commit c2af0c2
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 102 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -81,118 +81,51 @@ jobs:
       - name: test_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_gen_data -v --durations=0
           rm -rf data
       - name: test_custom_storage_root_gen_data
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v
+          mpirun -np 2 pytest -k test_storage_root_gen_data -v --durations=0
           rm -rf data
       - name: test_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v
-          mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v
+          mpirun -np 2 pytest -k test_train -v --durations=0
           rm -rf data
       - name: test_custom_storage_root_train
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v
-          mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v
+          mpirun -np 2 pytest -k test_custom_storage_root_train -v --durations=0
           rm -rf data
       - name: test_checkpoint_epoch
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v
-          mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v
+          mpirun -np 2 pytest -k test_checkpoint_epoch -v --durations=0
           rm -rf data
       - name: test_checkpoint_step
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_checkpoint_step -v
+          mpirun -np 2 pytest -k test_checkpoint_step  -v --durations=0
       - name: test_eval
         run: |
           source ${VENV_PATH}/bin/activate
           mpirun -np 2 pytest -k test_eval -v
       - name: test_multi_threads
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-0]  -v
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-1]  -v
-          mpirun -np 2 pytest -k test_multi_threads[tensorflow-2]  -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-0]  -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-1]  -v
-          mpirun -np 2 pytest -k test_multi_threads[pytorch-2]  -v
+          mpirun -np 2 pytest -k test_multi_threads -v --durations=0
           rm -rf data
       - name: test-pytorch-multiprocessing-context
         run: |
           source ${VENV_PATH}/bin/activate
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v
-          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v
+          mpirun -np 2 pytest -k test_pytorch_multiprocessing_context -v --durations=0
           rm -rf data
       - name: test_subset
         run: |
           source ${VENV_PATH}/bin/activate
           rm -rf output data checkpoints
-          mpirun -np 2 pytest -k test_subset -v
+          mpirun -np 2 pytest -k test_subset -v --durations=0
           rm -rf data
       - name: test-tf-loader-tfrecord
         run: |

diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py
@@ -45,38 +45,29 @@ def generate(self):
         """
         super().generate()
         np.random.seed(10)
-        samples_per_iter=max(1, int(self._args.generation_buffer_size/self._args.record_length))
         record_labels = [0] * self.num_samples
         dim = self.get_dimension(self.total_files_to_generate)
+        chunks = None
+        if self.enable_chunking:
+            chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
+            if chunk_dimension > self._dimension:
+                chunk_dimension = self._dimension
+            chunks = (1, chunk_dimension, chunk_dimension)
+        compression = None
+        compression_level = None
+        if self.compression != Compression.NONE:
+            compression = str(self.compression)
+            if self.compression == Compression.GZIP:
+                compression_level = self.compression_level
         for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
-            progress(i, self.total_files_to_generate, "Generating HDF5 Data")
             dim1 = dim[2*i]
             dim2 = dim[2*i+1]
-            records = np.random.randint(255, size=(samples_per_iter, dim1, dim2), dtype=np.uint8)
+            records = np.random.randint(255, size=(dim1, dim2, self.num_samples), dtype=np.uint8)
             out_path_spec = self.storage.get_uri(self._file_list[i])
+            progress(i+1, self.total_files_to_generate, "Generating NPZ Data")
             hf = h5py.File(out_path_spec, 'w')
-            chunks = None
-            if self.enable_chunking:
-                chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
-                if chunk_dimension > self._dimension:
-                    chunk_dimension = self._dimension
-                chunks = (1, chunk_dimension, chunk_dimension)
-            compression = None
-            compression_level = None
-            if self.compression != Compression.NONE:
-                compression = str(self.compression)
-                if self.compression == Compression.GZIP:
-                    compression_level = self.compression_level
-            dset = hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
-                                     compression_opts=compression_level, dtype=np.uint8)
-            samples_written = 0
-            while samples_written < self.num_samples:
-                if samples_per_iter < self.num_samples-samples_written:
-                    samples_to_write = samples_per_iter
-                else:
-                    samples_to_write = self.num_samples-samples_written
-                dset[samples_written:samples_written+samples_to_write] = records[:samples_to_write]
-                samples_written += samples_to_write
+            hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
+                                    compression_opts=compression_level, dtype=np.uint8, data=records)
             hf.create_dataset('labels', data=record_labels)
             hf.close()
         np.random.seed()
diff --git a/tests/dlio_benchmark_test.py b/tests/dlio_benchmark_test.py
@@ -48,7 +48,8 @@ def init():
     DLIOMPI.get_instance().initialize()
 
 def finalize():
-    DLIOMPI.get_instance().finalize()
+    # DLIOMPI.get_instance().finalize()
+    pass
 
 def clean(storage_root="./") -> None:
     comm.Barrier()