shamelessly use 8 parallel executions

Signed-off-by: dafnapension <[email protected]>
IBM · Nov 12, 2024 · 0994fbf · 0994fbf
1 parent 86d8533
commit 0994fbf
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 6 deletions.
diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
@@ -18,6 +18,9 @@ jobs:
        HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
        TQDM_DISABLE: "True"
 
+     strategy:
+       matrix:
+         modulo: [0,1,2,3,4, 5, 6, 7]
 
      steps:
      - uses: actions/checkout@v4
@@ -31,4 +34,9 @@ jobs:
      - run: huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
      - name: Run Tests
-       run: python -m unittest discover -s tests/catalog -p "test_*.py"
+       run: |
+         modulo="${{ matrix.modulo }}"
+         echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
+         echo "sed -i 's/^modulo = ./modulo = ${modulo}/' tests/catalog/test_preparation.py" > sedit.sh
+         sh sedit.sh
+         python -m unittest tests.catalog.test_preparation
diff --git a/tests/catalog/test_preparation.py b/tests/catalog/test_preparation.py
@@ -20,16 +20,27 @@
 )
 glob_query = os.path.join(project_dir, "prepare", "**", "*.py")
 all_preparation_files = glob.glob(glob_query, recursive=True)
+# Sort so that the order in which the tests are run is deterministic.
+# A different order in local testing and GitHub testing may cause differences in results.
+all_preparation_files.sort()
+num_par = 8  # num of parallel executions
+logger.critical(
+    f"Over all, {len(all_preparation_files)} files will now be tested over {num_par} parallel processes."
+)
+# modulo selects this run's share of the files; it must be one of 0..num_par-1, i.e. 0,1,2,3,4,5,6,7
+modulo = 1
+all_preparation_files = [
+    file for i, file in enumerate(all_preparation_files) if i % num_par == modulo
+]
 
 
 class TestCatalogPreparation(UnitxtCatalogPreparationTestCase):
     def test_preparations(self):
         logger.info(glob_query)
-        logger.critical(f"Testing preparation files: {all_preparation_files}")
-        # Make sure the order in which the tests are run is deterministic
-        # Having a different order for local testing and github testing may cause diffs in results.
+        logger.critical(
+            f"Testing {len(all_preparation_files)} preparation files: {all_preparation_files}"
+        )
         times = {}
-        all_preparation_files.sort()
         for file in all_preparation_files:
             logger.info(
                 "\n_____________________________________________\n"
@@ -70,6 +81,8 @@ def test_preparations(self):
                 logger.critical(f"Testing preparation file '{file}' failed:")
                 raise e
 
-        logger.critical("Preparation times table:")
+        logger.critical(
+            f"Preparation times table for the {len(times)} files that completed successfully:"
+        )
         times = dict(sorted(times.items(), key=lambda item: item[1], reverse=True))
         print_dict(times, log_level="critical")