From 47d92d874dfd12031341a2b50a3534c1dfc028a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 27 Nov 2024 14:21:27 +0100
Subject: [PATCH] feat: retain only last epoch directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new command-line argument, `--keep_last_epoch_only`. When
the flag is enabled, only the last epoch directory is kept: each epoch
overwrites the previous one, and the directory is named `last_epoch`
instead of `samples_<samples_seen>`.

This is useful for managing disk space during training. With large
models and datasets, each epoch checkpoint can consume a substantial
amount of storage, so keeping only the most recent model state
significantly reduces the space required and keeps the output directory
clean and manageable. The last checkpoint is often all that is needed
for training and evaluation purposes.

Since we always pick epoch 7 during phase 1 training and do not
evaluate every epoch, saving all of them may not be worthwhile; keeping
only the last one avoids the clutter.

Signed-off-by: Sébastien Han
---
 README.md                           |  1 +
 src/instructlab/training/config.py  |  5 +++++
 src/instructlab/training/main_ds.py | 11 +++++++++++
 src/instructlab/training/utils.py   |  9 +++++++--
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c6435542..f089ceff 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ for training jobs. There are a number of options you can specify, such as settin
 | fsdp_options | The settings for controlling FSDP when it's selected as the distributed backend. |
 | distributed_backend | Specifies which distributed training backend to use. Supported options are "fsdp" and "deepspeed". |
 | disable_flash_attn | Disables flash attention when set to true. This allows for training on older devices. |
+| keep_last_epoch_only | When set to true, only the last epoch directory is kept; each new epoch overwrites the previous one. The directory is named `last_epoch`. |
 
 ### `DeepSpeedOptions`
 
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index bf43f2eb..b877a884 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -206,3 +206,8 @@ class TrainingArgs(BaseModel):
 
     # This field defines whether or not data processing will occur inside of `run_training()`
     process_data: Optional[bool] = True
+
+    # This field specifies whether only the last epoch should be retained. When set to true, it will
+    # overwrite the previous epoch directory, keeping only one directory called "last_epoch".
+    # This works alongside the '--checkpoint_at_epoch' flag.
+    keep_last_epoch_only: Optional[bool] = False
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 0ad54c5f..ea7224e2 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -787,6 +787,9 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             f"--fsdp_sharding_strategy={train_args.fsdp_options.sharding_strategy.value}"
         )
 
+    if train_args.keep_last_epoch_only:
+        command.append("--keep_last_epoch_only")
+
     print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m")
     process = None
     interrupt: KeyboardInterrupt | Exception | None = None
@@ -962,6 +965,14 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         ),
     )
     parser.add_argument("--disable_flash_attn", action="store_true")
+    parser.add_argument(
+        "--keep_last_epoch_only",
+        action="store_true",
+        help=(
+            "Keep only the last epoch directory, overwriting the previous ones. Useful for saving disk space. "
+            "The last epoch will be saved as 'last_epoch'."
+        ),
+    )
     args = parser.parse_args()
     set_random_seed(args.seed)
     main(args)
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 41ec413f..6ffefae6 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -925,8 +925,11 @@ def save_hf_format_accelerate(
     samples_seen,
     is_lora=False,
 ):
+    # Build the subdirectory name
+    subdir = "last_epoch" if args.keep_last_epoch_only else f"samples_{samples_seen}"
+
     log_rank_0(
-        f"\033[93mSaving model in huggingface format at samples_seen: {samples_seen}\033[0m",
+        f"\033[93mSaving model in huggingface format at: {subdir}\033[0m",
         to_print=True,
     )
     start = time.time()
@@ -936,7 +939,9 @@ def save_hf_format_accelerate(
     else:
         convert_dolomite = True
 
-    final_output_dir = Path(args.output_dir) / "hf_format" / f"samples_{samples_seen}"
+    # Build the final output directory path
+    final_output_dir = Path(args.output_dir) / "hf_format" / subdir
+
     if args.use_dolomite and convert_dolomite:
         tmpdir = TemporaryDirectory("w")  # pylint: disable=consider-using-with
         output_dir = Path(tmpdir.name)
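
For illustration, a minimal, self-contained sketch of the checkpoint-naming
behaviour this patch adds to save_hf_format_accelerate(). The checkpoint_dir
helper is hypothetical, written only to mirror the added logic; the output
directory and sample count below are placeholders:

from pathlib import Path

# Hypothetical helper mirroring the naming logic added above: a single,
# repeatedly overwritten "last_epoch" directory when the flag is set,
# otherwise one directory per checkpoint keyed by the samples seen.
def checkpoint_dir(output_dir: str, samples_seen: int, keep_last_epoch_only: bool) -> Path:
    subdir = "last_epoch" if keep_last_epoch_only else f"samples_{samples_seen}"
    return Path(output_dir) / "hf_format" / subdir

print(checkpoint_dir("ckpt", 38400, keep_last_epoch_only=False))  # ckpt/hf_format/samples_38400
print(checkpoint_dir("ckpt", 38400, keep_last_epoch_only=True))   # ckpt/hf_format/last_epoch

In the patched code path the same choice is made once per save: with the flag
enabled, every epoch is written to hf_format/last_epoch, so the directory
always holds only the most recent checkpoint.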