From 8bd4698b9e82c1f90c8531b97bf67be40fdc8856 Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Mon, 23 Sep 2024 14:51:48 -0400
Subject: [PATCH 1/3] make data processing optional in run_training.

Co-authored-by: Michael Clifford
Co-authored-by: Shreyanand
Signed-off-by: Michael Clifford
---
 README.md                                | 164 +++++++++++++++++++++++
 src/instructlab/training/__init__.py     |   4 +-
 src/instructlab/training/data_process.py |   4 +
 src/instructlab/training/main_ds.py      |  36 +++--
 4 files changed, 187 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index f57937e9..1f11efd6 100644
--- a/README.md
+++ b/README.md
@@ -280,3 +280,167 @@ run_training(
     torchrun_args=torchrun_args,
     training_args=training_args,
 )
+
+```
+
+### Customizing `TrainingArgs`
+
+The `TrainingArgs` class provides most of the customization options
+for the training job itself. There are a number of options you can specify, such as setting
+DeepSpeed config values or running a LoRA training job instead of a full fine-tune.
+
+Here is a breakdown of the general options:
+
+| Field | Description |
+| --- | --- |
+| model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
+| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
+| ckpt_output_dir | Directory where trained model checkpoints will be saved. |
+| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
+| max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
+| max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
+| num_epochs | Number of epochs to run through before stopping. |
+| effective_batch_size | The amount of samples in a batch to see before we update the model parameters. |
+| save_samples | Number of samples the model should see before saving a checkpoint. Consider this to be the checkpoint save frequency. |
+| learning_rate | How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. |
+| warmup_steps | The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to `learning_rate`. |
+| is_padding_free | Boolean value to indicate whether or not we're training a padding-free transformer model such as Granite. |
+| random_seed | The random seed PyTorch will use. |
+| mock_data | Whether or not to use mock, randomly generated, data during training. For debug purposes |
+| mock_data_len | Max length of a single mock data sample. Equivalent to `max_seq_len` but for mock data. |
+| deepspeed_options | Config options to specify for the DeepSpeed optimizer. |
+| lora | Options to specify if you intend to perform a LoRA train instead of a full fine-tune. |
+
+#### `DeepSpeedOptions`
+
+We only currently support a few options in `DeepSpeedOptions`:
+The default is to run with DeepSpeed, so these options only currently
+allow you to customize aspects of the ZeRO stage 2 optimizer.
+
+| Field | Description |
+| --- | --- |
+| cpu_offload_optimizer | Whether or not to do CPU offloading in DeepSpeed stage 2. |
+
+#### `loraOptions`
+
+If you'd like to do a LoRA train, you can specify a LoRA
+option to `TrainingArgs` via the `LoraOptions` object.
+
+```python
+from instructlab.training import LoraOptions, TrainingArgs
+
+training_args = TrainingArgs(
+    lora = LoraOptions(
+        rank = 4,
+        alpha = 32,
+        dropout = 0.1,
+    ),
+    # ...
+)
+```
+
+Here is the definition for what we currently support today:
+
+| Field | Description |
+| --- | --- |
+| rank | The rank parameter for LoRA training. |
+| alpha | The alpha parameter for LoRA training. |
+| dropout | The dropout rate for LoRA training. |
+| target_modules | The list of target modules for LoRA training. |
+| quantize_data_type | The data type for quantization in LoRA training. Valid options are `None` and `"nf4"` |
+
+### Customizing `TorchrunArgs`
+
+When running the training script, we always invoke `torchrun`.
+
+If you are running a single-GPU system or something that doesn't
+otherwise require distributed training configuration, you can
+just create a default object:
+
+```python
+run_training(
+    torchrun_args=TorchrunArgs(),
+    training_args=TrainingArgs(
+        # ...
+    ),
+)
+```
+
+However, if you want to specify a more complex configuration,
+we currently expose all of the options that [torchrun accepts
+today](https://pytorch.org/docs/stable/elastic/run.html#definitions).
+
+> [!NOTE]
+> For more information about the `torchrun` arguments, please consult the [torchrun documentation](https://pytorch.org/docs/stable/elastic/run.html#definitions).
+
+For example, in an 8-GPU, 2-machine system, we would
+specify the following torchrun config:
+
+```python
+MASTER_ADDR = os.getenv('MASTER_ADDR')
+MASTER_PORT = os.getenv('MASTER_PORT')
+RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
+
+# on machine 1
+torchrun_args = TorchrunArgs(
+    nnodes = 2, # number of machines
+    nproc_per_node = 4, # num GPUs per machine
+    node_rank = 0, # node rank for this machine
+    rdzv_id = 123,
+    rdzv_endpoint = RDZV_ENDPOINT
+)
+
+run_training(
+    torchrun_args=torchrun_args,
+    training_args=training_args
+)
+```
+
+```python
+MASTER_ADDR = os.getenv('MASTER_ADDR')
+MASTER_PORT = os.getenv('MASTER_PORT')
+RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
+
+# on machine 2
+torchrun_args = TorchrunArgs(
+    nnodes = 2, # number of machines
+    nproc_per_node = 4, # num GPUs per machine
+    node_rank = 1, # node rank for this machine
+    rdzv_id = 123,
+    rdzv_endpoint = f'{MASTER_ADDR}:{MASTER_PORT}'
+)
+
+run_training(
+    torch_args=torchrun_args,
+    train_args=training_args
+)
+```
+If the machine's above have shared storage, users can preprocess the training dataset a single time so that it can then distributed to each machine with the following update:
+
+```python
+from instructlab.training import (
+    run_training,
+    TorchrunArgs,
+    TrainingArgs,
+    DeepSpeedOptions,
+    DataProcessArgs,
+    data_process as dp
+)
+
+...
+
+data_process_args = DataProcessArgs(
+    data_output_path = training_args.data_output_dir,
+    model_path = training_args.model_path,
+    data_path = training_args.data_path,
+    max_seq_len = training_args.max_seq_len,
+    chat_tmpl_path = training_args.chat_tmpl_path
+)
+
+dp.main(data_process_args)
+run_training(
+    torch_args=torchrun_args,
+    train_args=training_args,
+    process_data = False
+)
+```
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index a2ed292a..499625a5 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,9 +28,9 @@
 
 
 # defer import of main_ds
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(torch_args=torch_args, train_args=train_args)
+    return run_training(torch_args=torch_args, train_args=train_args, process_data=process_data)
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 2e6cd393..9b332d36 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -221,6 +221,10 @@ def get_masked_and_orig_text(sample):
 
 
 def main(args: DataProcessArgs):
+
+    if not os.path.exists(args.data_output_path):
+        os.makedirs(args.data_output_path, exist_ok=True)
+
     print("\033[92m data arguments are:\033[0m")
     print("\033[36m" + args.model_dump_json() + "\033[0m")
     NUM_PROC = args.num_cpu_procs
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 07684a9a..12d1b6bf 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,7 +635,7 @@ def main(args):
 
 
 # public API
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -644,25 +644,23 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
-
-    # process the training data
-    if not os.path.exists(train_args.data_output_dir):
-        os.makedirs(train_args.data_output_dir, exist_ok=True)
-    dp.main(
-        DataProcessArgs(
-            # XXX(osilkin): make a decision here, either:
-            # 1. the CLI is fully responsible for managing where the data is written
-            # 2. we never cache it and simply write it to a tmp file every time.
-            #
-            # An important reason for why #1 would be preferable is in the case of OpenShift/SELinux
-            # where the user has a defined place for new temporary data to be written.
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
+
+    if process_data:
+        dp.main(
+            DataProcessArgs(
+                # XXX(osilkin): make a decision here, either:
+                # 1. the CLI is fully responsible for managing where the data is written
+                # 2. we never cache it and simply write it to a tmp file every time.
+                #
+                # An important reason for why #1 would be preferable is in the case of OpenShift/SELinux
+                # where the user has a defined place for new temporary data to be written.
+                data_output_path=train_args.data_output_dir,
+                model_path=train_args.model_path,
+                data_path=train_args.data_path,
+                max_seq_len=train_args.max_seq_len,
+                chat_tmpl_path=train_args.chat_tmpl_path,
+            )
         )
-    )
 
     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)

From aefde0efc0c912a11251b29658b56ec104f3d6e7 Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Mon, 23 Sep 2024 17:10:23 -0400
Subject: [PATCH 2/3] pre-commit formatting

Signed-off-by: Michael Clifford
---
 README.md                                | 134 +----------------------
 src/instructlab/training/__init__.py     |   8 +-
 src/instructlab/training/data_process.py |   2 -
 src/instructlab/training/main_ds.py      |   6 +-
 4 files changed, 11 insertions(+), 139 deletions(-)

diff --git a/README.md b/README.md
index 1f11efd6..40869a59 100644
--- a/README.md
+++ b/README.md
@@ -283,139 +283,7 @@ run_training(
 
 ```
 
-### Customizing `TrainingArgs`
-
-The `TrainingArgs` class provides most of the customization options
-for the training job itself. There are a number of options you can specify, such as setting
-DeepSpeed config values or running a LoRA training job instead of a full fine-tune.
-
-Here is a breakdown of the general options:
-
-| Field | Description |
-| --- | --- |
-| model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
-| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
-| ckpt_output_dir | Directory where trained model checkpoints will be saved. |
-| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
-| max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
-| max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
-| num_epochs | Number of epochs to run through before stopping. |
-| effective_batch_size | The amount of samples in a batch to see before we update the model parameters. |
-| save_samples | Number of samples the model should see before saving a checkpoint. Consider this to be the checkpoint save frequency. |
-| learning_rate | How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. |
-| warmup_steps | The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to `learning_rate`. |
-| is_padding_free | Boolean value to indicate whether or not we're training a padding-free transformer model such as Granite. |
-| random_seed | The random seed PyTorch will use. |
-| mock_data | Whether or not to use mock, randomly generated, data during training. For debug purposes |
-| mock_data_len | Max length of a single mock data sample. Equivalent to `max_seq_len` but for mock data. |
-| deepspeed_options | Config options to specify for the DeepSpeed optimizer. |
-| lora | Options to specify if you intend to perform a LoRA train instead of a full fine-tune. |
-
-#### `DeepSpeedOptions`
-
-We only currently support a few options in `DeepSpeedOptions`:
-The default is to run with DeepSpeed, so these options only currently
-allow you to customize aspects of the ZeRO stage 2 optimizer.
-
-| Field | Description |
-| --- | --- |
-| cpu_offload_optimizer | Whether or not to do CPU offloading in DeepSpeed stage 2. |
-
-#### `loraOptions`
-
-If you'd like to do a LoRA train, you can specify a LoRA
-option to `TrainingArgs` via the `LoraOptions` object.
-
-```python
-from instructlab.training import LoraOptions, TrainingArgs
-
-training_args = TrainingArgs(
-    lora = LoraOptions(
-        rank = 4,
-        alpha = 32,
-        dropout = 0.1,
-    ),
-    # ...
-)
-```
-
-Here is the definition for what we currently support today:
-
-| Field | Description |
-| --- | --- |
-| rank | The rank parameter for LoRA training. |
-| alpha | The alpha parameter for LoRA training. |
-| dropout | The dropout rate for LoRA training. |
-| target_modules | The list of target modules for LoRA training. |
-| quantize_data_type | The data type for quantization in LoRA training. Valid options are `None` and `"nf4"` |
-
-### Customizing `TorchrunArgs`
-
-When running the training script, we always invoke `torchrun`.
-
-If you are running a single-GPU system or something that doesn't
-otherwise require distributed training configuration, you can
-just create a default object:
-
-```python
-run_training(
-    torchrun_args=TorchrunArgs(),
-    training_args=TrainingArgs(
-        # ...
-    ),
-)
-```
-
-However, if you want to specify a more complex configuration,
-we currently expose all of the options that [torchrun accepts
-today](https://pytorch.org/docs/stable/elastic/run.html#definitions).
-
-> [!NOTE]
-> For more information about the `torchrun` arguments, please consult the [torchrun documentation](https://pytorch.org/docs/stable/elastic/run.html#definitions).
-
-For example, in an 8-GPU, 2-machine system, we would
-specify the following torchrun config:
-
-```python
-MASTER_ADDR = os.getenv('MASTER_ADDR')
-MASTER_PORT = os.getenv('MASTER_PORT')
-RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
-
-# on machine 1
-torchrun_args = TorchrunArgs(
-    nnodes = 2, # number of machines
-    nproc_per_node = 4, # num GPUs per machine
-    node_rank = 0, # node rank for this machine
-    rdzv_id = 123,
-    rdzv_endpoint = RDZV_ENDPOINT
-)
-
-run_training(
-    torchrun_args=torchrun_args,
-    training_args=training_args
-)
-```
-
-```python
-MASTER_ADDR = os.getenv('MASTER_ADDR')
-MASTER_PORT = os.getenv('MASTER_PORT')
-RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
-
-# on machine 2
-torchrun_args = TorchrunArgs(
-    nnodes = 2, # number of machines
-    nproc_per_node = 4, # num GPUs per machine
-    node_rank = 1, # node rank for this machine
-    rdzv_id = 123,
-    rdzv_endpoint = f'{MASTER_ADDR}:{MASTER_PORT}'
-)
-
-run_training(
-    torch_args=torchrun_args,
-    train_args=training_args
-)
-```
-If the machine's above have shared storage, users can preprocess the training dataset a single time so that it can then distributed to each machine with the following update:
+If the machines above have shared storage, users can preprocess the training dataset a single time so that it can then be distributed to each machine with the following update:
 
 ```python
 from instructlab.training import (
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index 499625a5..7b4bf5e3 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,9 +28,13 @@
 
 
 # defer import of main_ds
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
+def run_training(
+    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
+) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(torch_args=torch_args, train_args=train_args, process_data=process_data)
+    return run_training(
+        torch_args=torch_args, train_args=train_args, process_data=process_data
+    )
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 9b332d36..4bd7c789 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -221,10 +221,8 @@ def get_masked_and_orig_text(sample):
 
 
 def main(args: DataProcessArgs):
-
     if not os.path.exists(args.data_output_path):
         os.makedirs(args.data_output_path, exist_ok=True)
-
     print("\033[92m data arguments are:\033[0m")
     print("\033[36m" + args.model_dump_json() + "\033[0m")
     NUM_PROC = args.num_cpu_procs
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 12d1b6bf..710fcd52 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,7 +635,9 @@ def main(args):
 
 
 # public API
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
+def run_training(
+    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
+) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -644,7 +646,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_dat
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
-
+
     if process_data:
         dp.main(
             DataProcessArgs(

From f97dca380edb430b20c4b429367edcc8e3dda9ae Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Sat, 5 Oct 2024 14:42:06 -0400
Subject: [PATCH 3/3] move process_data arg into TrainingArgs

Signed-off-by: Michael Clifford
---
 README.md                            | 24 ++++++++++++++++++++++--
 src/instructlab/training/__init__.py |  8 ++------
 src/instructlab/training/config.py   |  3 +++
 src/instructlab/training/main_ds.py  |  6 ++----
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 40869a59..645583f3 100644
--- a/README.md
+++ b/README.md
@@ -283,7 +283,9 @@ run_training(
 
 ```
 
-If the machines above have shared storage, users can preprocess the training dataset a single time so that it can then be distributed to each machine with the following update:
+## Example training with separate data pre-processing
+
+If the machines in the example above have shared storage, users can pre-process the training dataset a single time so that it can then be distributed to each machine by making the following updates.
 
 ```python
 from instructlab.training import (
     run_training,
     TorchrunArgs,
     TrainingArgs,
     DeepSpeedOptions,
     DataProcessArgs,
     data_process as dp
 )
 
+training_args = TrainingArgs(
+    # define data-specific arguments
+    model_path = "ibm-granite/granite-7b-base",
+    data_path = "path/to/dataset.jsonl",
+    ckpt_output_dir = "data/saved_checkpoints",
+    data_output_dir = "data/outputs",
+
+    # define model-training parameters
+    max_seq_len = 4096,
+    max_batch_len = 60000,
+    num_epochs = 10,
+    effective_batch_size = 3840,
+    save_samples = 250000,
+    learning_rate = 2e-6,
+    warmup_steps = 800,
+    is_padding_free = True, # set this to true when using Granite-based models
+    random_seed = 42,
+    process_data = True,
+)
 ...
 
 data_process_args = DataProcessArgs(
     data_output_path = training_args.data_output_dir,
     model_path = training_args.model_path,
     data_path = training_args.data_path,
     max_seq_len = training_args.max_seq_len,
     chat_tmpl_path = training_args.chat_tmpl_path
 )
 
 dp.main(data_process_args)
 run_training(
     torch_args=torchrun_args,
     train_args=training_args,
-    process_data = False
 )
 ```
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index 7b4bf5e3..a2ed292a 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,13 +28,9 @@
 
 
 # defer import of main_ds
-def run_training(
-    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
-) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(
-        torch_args=torch_args, train_args=train_args, process_data=process_data
-    )
+    return run_training(torch_args=torch_args, train_args=train_args)
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 03e963f2..05fe4792 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -199,3 +199,6 @@ class TrainingArgs(BaseModel):
     # https://github.com/instructlab/training/issues/28
     # quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
     lora: LoraOptions | None = None
+
+    # This field defines whether or not data processing will occur inside of `run_training()`
+    process_data: Optional[bool] = True
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 710fcd52..c5cdb2ba 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,9 +635,7 @@ def main(args):
 
 
 # public API
-def run_training(
-    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
-) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -647,7 +645,7 @@ def run_training(
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
 
-    if process_data:
+    if train_args.process_data:
         dp.main(
             DataProcessArgs(
                 # XXX(osilkin): make a decision here, either: