From 8bd4698b9e82c1f90c8531b97bf67be40fdc8856 Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Mon, 23 Sep 2024 14:51:48 -0400
Subject: [PATCH 1/3] make data processing optional in run_training.

Co-authored-by: Michael Clifford
Co-authored-by: Shreyanand
Signed-off-by: Michael Clifford
---
 README.md                                | 164 +++++++++++++++++++++++
 src/instructlab/training/__init__.py     |   4 +-
 src/instructlab/training/data_process.py |   4 +
 src/instructlab/training/main_ds.py      |  36 +++--
 4 files changed, 187 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index f57937e9..1f11efd6 100644
--- a/README.md
+++ b/README.md
@@ -280,3 +280,167 @@ run_training(
     torchrun_args=torchrun_args,
     training_args=training_args,
 )
+
+```
+
+### Customizing `TrainingArgs`
+
+The `TrainingArgs` class provides most of the customization options
+for the training job itself. There are a number of options you can specify, such as setting
+DeepSpeed config values or running a LoRA training job instead of a full fine-tune.
+
+Here is a breakdown of the general options:
+
+| Field | Description |
+| --- | --- |
+| model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
+| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
+| ckpt_output_dir | Directory where trained model checkpoints will be saved. |
+| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
+| max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
+| max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
+| num_epochs | Number of epochs to run through before stopping. |
+| effective_batch_size | The amount of samples in a batch to see before we update the model parameters. |
+| save_samples | Number of samples the model should see before saving a checkpoint. Consider this to be the checkpoint save frequency. |
+| learning_rate | How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. |
+| warmup_steps | The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to `learning_rate`. |
+| is_padding_free | Boolean value to indicate whether or not we're training a padding-free transformer model such as Granite. |
+| random_seed | The random seed PyTorch will use. |
+| mock_data | Whether or not to use mock, randomly generated, data during training. For debug purposes |
+| mock_data_len | Max length of a single mock data sample. Equivalent to `max_seq_len` but for mock data. |
+| deepspeed_options | Config options to specify for the DeepSpeed optimizer. |
+| lora | Options to specify if you intend to perform a LoRA train instead of a full fine-tune. |
+
+#### `DeepSpeedOptions`
+
+We only currently support a few options in `DeepSpeedOptions`:
+The default is to run with DeepSpeed, so these options only currently
+allow you to customize aspects of the ZeRO stage 2 optimizer.
+
+| Field | Description |
+| --- | --- |
+| cpu_offload_optimizer | Whether or not to do CPU offloading in DeepSpeed stage 2. |
+
+#### `loraOptions`
+
+If you'd like to do a LoRA train, you can specify a LoRA
+option to `TrainingArgs` via the `LoraOptions` object.
+
+```python
+from instructlab.training import LoraOptions, TrainingArgs
+
+training_args = TrainingArgs(
+    lora = LoraOptions(
+        rank = 4,
+        alpha = 32,
+        dropout = 0.1,
+    ),
+    # ...
+)
+```
+
+Here is the definition for what we currently support today:
+
+| Field | Description |
+| --- | --- |
+| rank | The rank parameter for LoRA training. |
+| alpha | The alpha parameter for LoRA training. |
+| dropout | The dropout rate for LoRA training. |
+| target_modules | The list of target modules for LoRA training. |
+| quantize_data_type | The data type for quantization in LoRA training. Valid options are `None` and `"nf4"` |
+
+### Customizing `TorchrunArgs`
+
+When running the training script, we always invoke `torchrun`.
+
+If you are running a single-GPU system or something that doesn't
+otherwise require distributed training configuration, you can
+just create a default object:
+
+```python
+run_training(
+    torchrun_args=TorchrunArgs(),
+    training_args=TrainingArgs(
+        # ...
+    ),
+)
+```
+
+However, if you want to specify a more complex configuration,
+we currently expose all of the options that [torchrun accepts
+today](https://pytorch.org/docs/stable/elastic/run.html#definitions).
+
+> [!NOTE]
+> For more information about the `torchrun` arguments, please consult the [torchrun documentation](https://pytorch.org/docs/stable/elastic/run.html#definitions).
+
+For example, in an 8-GPU, 2-machine system, we would
+specify the following torchrun config:
+
+```python
+MASTER_ADDR = os.getenv('MASTER_ADDR')
+MASTER_PORT = os.getenv('MASTER_PORT')
+RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
+
+# on machine 1
+torchrun_args = TorchrunArgs(
+    nnodes = 2, # number of machines
+    nproc_per_node = 4, # num GPUs per machine
+    node_rank = 0, # node rank for this machine
+    rdzv_id = 123,
+    rdzv_endpoint = RDZV_ENDPOINT
+)
+
+run_training(
+    torchrun_args=torchrun_args,
+    training_args=training_args
+)
+```
+
+```python
+MASTER_ADDR = os.getenv('MASTER_ADDR')
+MASTER_PORT = os.getenv('MASTER_PORT')
+RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
+
+# on machine 2
+torchrun_args = TorchrunArgs(
+    nnodes = 2, # number of machines
+    nproc_per_node = 4, # num GPUs per machine
+    node_rank = 1, # node rank for this machine
+    rdzv_id = 123,
+    rdzv_endpoint = f'{MASTER_ADDR}:{MASTER_PORT}'
+)
+
+run_training(
+    torch_args=torchrun_args,
+    train_args=training_args
+)
+```
+If the machine's above have shared storage, users can preprocess the training dataset a single time so that it can then distributed to each machine with the following update:
+
+```python
+from instructlab.training import (
+    run_training,
+    TorchrunArgs,
+    TrainingArgs,
+    DeepSpeedOptions,
+    DataProcessArgs,
+    data_process as dp
+)
+
+...
+
+data_process_args = DataProcessArgs(
+    data_output_path = training_args.data_output_dir,
+    model_path = training_args.model_path,
+    data_path = training_args.data_path,
+    max_seq_len = training_args.max_seq_len,
+    chat_tmpl_path = training_args.chat_tmpl_path
+)
+
+dp.main(data_process_args)
+run_training(
+    torch_args=torchrun_args,
+    train_args=training_args,
+    process_data = False
+)
+```
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index a2ed292a..499625a5 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,9 +28,9 @@
 
 
 # defer import of main_ds
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(torch_args=torch_args, train_args=train_args)
+    return run_training(torch_args=torch_args, train_args=train_args, process_data=process_data)
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 2e6cd393..9b332d36 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -221,6 +221,10 @@ def get_masked_and_orig_text(sample):
 
 
 def main(args: DataProcessArgs):
+
+    if not os.path.exists(args.data_output_path):
+        os.makedirs(args.data_output_path, exist_ok=True)
+
     print("\033[92m data arguments are:\033[0m")
     print("\033[36m" + args.model_dump_json() + "\033[0m")
     NUM_PROC = args.num_cpu_procs
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 07684a9a..12d1b6bf 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,7 +635,7 @@ def main(args):
 
 
 # public API
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -644,25 +644,23 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
-
-    # process the training data
-    if not os.path.exists(train_args.data_output_dir):
-        os.makedirs(train_args.data_output_dir, exist_ok=True)
-    dp.main(
-        DataProcessArgs(
-            # XXX(osilkin): make a decision here, either:
-            # 1. the CLI is fully responsible for managing where the data is written
-            # 2. we never cache it and simply write it to a tmp file every time.
-            #
-            # An important reason for why #1 would be preferable is in the case of OpenShift/SELinux
-            # where the user has a defined place for new temporary data to be written.
-            data_output_path=train_args.data_output_dir,
-            model_path=train_args.model_path,
-            data_path=train_args.data_path,
-            max_seq_len=train_args.max_seq_len,
-            chat_tmpl_path=train_args.chat_tmpl_path,
+
+    if process_data:
+        dp.main(
+            DataProcessArgs(
+                # XXX(osilkin): make a decision here, either:
+                # 1. the CLI is fully responsible for managing where the data is written
+                # 2. we never cache it and simply write it to a tmp file every time.
+                #
+                # An important reason for why #1 would be preferable is in the case of OpenShift/SELinux
+                # where the user has a defined place for new temporary data to be written.
+                data_output_path=train_args.data_output_dir,
+                model_path=train_args.model_path,
+                data_path=train_args.data_path,
+                max_seq_len=train_args.max_seq_len,
+                chat_tmpl_path=train_args.chat_tmpl_path,
+            )
         )
-    )
 
     if not os.path.exists(train_args.ckpt_output_dir):
         os.makedirs(train_args.ckpt_output_dir, exist_ok=True)

From aefde0efc0c912a11251b29658b56ec104f3d6e7 Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Mon, 23 Sep 2024 17:10:23 -0400
Subject: [PATCH 2/3] pre-commit formatting

Signed-off-by: Michael Clifford
---
 README.md                                | 134 +----------------------
 src/instructlab/training/__init__.py     |   8 +-
 src/instructlab/training/data_process.py |   2 -
 src/instructlab/training/main_ds.py      |   6 +-
 4 files changed, 11 insertions(+), 139 deletions(-)

diff --git a/README.md b/README.md
index 1f11efd6..40869a59 100644
--- a/README.md
+++ b/README.md
@@ -283,139 +283,7 @@ run_training(
 
 ```
 
-### Customizing `TrainingArgs`
-
-The `TrainingArgs` class provides most of the customization options
-for the training job itself. There are a number of options you can specify, such as setting
-DeepSpeed config values or running a LoRA training job instead of a full fine-tune.
-
-Here is a breakdown of the general options:
-
-| Field | Description |
-| --- | --- |
-| model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
-| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
-| ckpt_output_dir | Directory where trained model checkpoints will be saved. |
-| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
-| max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
-| max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
-| num_epochs | Number of epochs to run through before stopping. |
-| effective_batch_size | The amount of samples in a batch to see before we update the model parameters. |
-| save_samples | Number of samples the model should see before saving a checkpoint. Consider this to be the checkpoint save frequency. |
-| learning_rate | How fast we optimize the weights during gradient descent. Higher values may lead to unstable learning performance. It's generally recommended to have a low learning rate with a high effective batch size. |
-| warmup_steps | The number of steps a model should go through before reaching the full learning rate. We start at 0 and linearly climb up to `learning_rate`. |
-| is_padding_free | Boolean value to indicate whether or not we're training a padding-free transformer model such as Granite. |
-| random_seed | The random seed PyTorch will use. |
-| mock_data | Whether or not to use mock, randomly generated, data during training. For debug purposes |
-| mock_data_len | Max length of a single mock data sample. Equivalent to `max_seq_len` but for mock data. |
-| deepspeed_options | Config options to specify for the DeepSpeed optimizer. |
-| lora | Options to specify if you intend to perform a LoRA train instead of a full fine-tune. |
-
-#### `DeepSpeedOptions`
-
-We only currently support a few options in `DeepSpeedOptions`:
-The default is to run with DeepSpeed, so these options only currently
-allow you to customize aspects of the ZeRO stage 2 optimizer.
-
-| Field | Description |
-| --- | --- |
-| cpu_offload_optimizer | Whether or not to do CPU offloading in DeepSpeed stage 2. |
-
-#### `loraOptions`
-
-If you'd like to do a LoRA train, you can specify a LoRA
-option to `TrainingArgs` via the `LoraOptions` object.
-
-```python
-from instructlab.training import LoraOptions, TrainingArgs
-
-training_args = TrainingArgs(
-    lora = LoraOptions(
-        rank = 4,
-        alpha = 32,
-        dropout = 0.1,
-    ),
-    # ...
-)
-```
-
-Here is the definition for what we currently support today:
-
-| Field | Description |
-| --- | --- |
-| rank | The rank parameter for LoRA training. |
-| alpha | The alpha parameter for LoRA training. |
-| dropout | The dropout rate for LoRA training. |
-| target_modules | The list of target modules for LoRA training. |
-| quantize_data_type | The data type for quantization in LoRA training. Valid options are `None` and `"nf4"` |
-
-### Customizing `TorchrunArgs`
-
-When running the training script, we always invoke `torchrun`.
-
-If you are running a single-GPU system or something that doesn't
-otherwise require distributed training configuration, you can
-just create a default object:
-
-```python
-run_training(
-    torchrun_args=TorchrunArgs(),
-    training_args=TrainingArgs(
-        # ...
-    ),
-)
-```
-
-However, if you want to specify a more complex configuration,
-we currently expose all of the options that [torchrun accepts
-today](https://pytorch.org/docs/stable/elastic/run.html#definitions).
-
-> [!NOTE]
-> For more information about the `torchrun` arguments, please consult the [torchrun documentation](https://pytorch.org/docs/stable/elastic/run.html#definitions).
-
-For example, in an 8-GPU, 2-machine system, we would
-specify the following torchrun config:
-
-```python
-MASTER_ADDR = os.getenv('MASTER_ADDR')
-MASTER_PORT = os.getenv('MASTER_PORT')
-RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
-
-# on machine 1
-torchrun_args = TorchrunArgs(
-    nnodes = 2, # number of machines
-    nproc_per_node = 4, # num GPUs per machine
-    node_rank = 0, # node rank for this machine
-    rdzv_id = 123,
-    rdzv_endpoint = RDZV_ENDPOINT
-)
-
-run_training(
-    torchrun_args=torchrun_args,
-    training_args=training_args
-)
-```
-
-```python
-MASTER_ADDR = os.getenv('MASTER_ADDR')
-MASTER_PORT = os.getenv('MASTER_PORT')
-RDZV_ENDPOINT = f'{MASTER_ADDR}:{MASTER_PORT}'
-
-# on machine 2
-torchrun_args = TorchrunArgs(
-    nnodes = 2, # number of machines
-    nproc_per_node = 4, # num GPUs per machine
-    node_rank = 1, # node rank for this machine
-    rdzv_id = 123,
-    rdzv_endpoint = f'{MASTER_ADDR}:{MASTER_PORT}'
-)
-
-run_training(
-    torch_args=torchrun_args,
-    train_args=training_args
-)
-```
-If the machine's above have shared storage, users can preprocess the training dataset a single time so that it can then distributed to each machine with the following update:
+If the machines above have shared storage, users can preprocess the training dataset a single time so that it can then be distributed to each machine with the following update:
 
 ```python
 from instructlab.training import (
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index 499625a5..7b4bf5e3 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,9 +28,13 @@
 
 
 # defer import of main_ds
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
+def run_training(
+    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
+) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(torch_args=torch_args, train_args=train_args, process_data=process_data)
+    return run_training(
+        torch_args=torch_args, train_args=train_args, process_data=process_data
+    )
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 9b332d36..4bd7c789 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -221,10 +221,8 @@ def get_masked_and_orig_text(sample):
 
 
 def main(args: DataProcessArgs):
-
     if not os.path.exists(args.data_output_path):
         os.makedirs(args.data_output_path, exist_ok=True)
-
     print("\033[92m data arguments are:\033[0m")
     print("\033[36m" + args.model_dump_json() + "\033[0m")
     NUM_PROC = args.num_cpu_procs
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 12d1b6bf..710fcd52 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,7 +635,9 @@ def main(args):
 
 
 # public API
-def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True) -> None:
+def run_training(
+    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
+) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -644,7 +646,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs, process_dat
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
-
+
     if process_data:
         dp.main(
             DataProcessArgs(

From f97dca380edb430b20c4b429367edcc8e3dda9ae Mon Sep 17 00:00:00 2001
From: Michael Clifford
Date: Sat, 5 Oct 2024 14:42:06 -0400
Subject: [PATCH 3/3] move process_data arg into TrainingArgs

Signed-off-by: Michael Clifford
---
 README.md                            | 24 ++++++++++++++++++++++--
 src/instructlab/training/__init__.py |  8 ++------
 src/instructlab/training/config.py   |  3 +++
 src/instructlab/training/main_ds.py  |  6 ++----
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 40869a59..645583f3 100644
--- a/README.md
+++ b/README.md
@@ -283,7 +283,9 @@ run_training(
 
 ```
 
-If the machines above have shared storage, users can preprocess the training dataset a single time so that it can then be distributed to each machine with the following update:
+## Example training with separate data pre-processing
+
+If the machines in the example above have shared storage, users can pre-process the training dataset a single time so that it can then be distributed to each machine by making the following updates.
 
 ```python
 from instructlab.training import (
     run_training,
     TorchrunArgs,
     TrainingArgs,
     DeepSpeedOptions,
     DataProcessArgs,
     data_process as dp
 )
 
+training_args = TrainingArgs(
+    # define data-specific arguments
+    model_path = "ibm-granite/granite-7b-base",
+    data_path = "path/to/dataset.jsonl",
+    ckpt_output_dir = "data/saved_checkpoints",
+    data_output_dir = "data/outputs",
+
+    # define model-training parameters
+    max_seq_len = 4096,
+    max_batch_len = 60000,
+    num_epochs = 10,
+    effective_batch_size = 3840,
+    save_samples = 250000,
+    learning_rate = 2e-6,
+    warmup_steps = 800,
+    is_padding_free = True, # set this to true when using Granite-based models
+    random_seed = 42,
+    process_data = True,
+)
 ...
 
 data_process_args = DataProcessArgs(
     data_output_path = training_args.data_output_dir,
     model_path = training_args.model_path,
     data_path = training_args.data_path,
     max_seq_len = training_args.max_seq_len,
     chat_tmpl_path = training_args.chat_tmpl_path
 )
 
 dp.main(data_process_args)
 run_training(
     torch_args=torchrun_args,
     train_args=training_args,
-    process_data = False
 )
 ```
diff --git a/src/instructlab/training/__init__.py b/src/instructlab/training/__init__.py
index 7b4bf5e3..a2ed292a 100644
--- a/src/instructlab/training/__init__.py
+++ b/src/instructlab/training/__init__.py
@@ -28,13 +28,9 @@
 
 
 # defer import of main_ds
-def run_training(
-    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
-) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     """Wrapper around the main training job that calls torchrun."""
     # Local
     from .main_ds import run_training
 
-    return run_training(
-        torch_args=torch_args, train_args=train_args, process_data=process_data
-    )
+    return run_training(torch_args=torch_args, train_args=train_args)
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 03e963f2..05fe4792 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -199,3 +199,6 @@ class TrainingArgs(BaseModel):
     # https://github.com/instructlab/training/issues/28
     # quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
     lora: LoraOptions | None = None
+
+    # This field defines whether or not data processing will occur inside of `run_training()`
+    process_data: Optional[bool] = True
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 710fcd52..c5cdb2ba 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -635,9 +635,7 @@ def main(args):
 
 
 # public API
-def run_training(
-    torch_args: TorchrunArgs, train_args: TrainingArgs, process_data: bool = True
-) -> None:
+def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     """
     Wrapper around the main training job that calls torchrun.
     """
@@ -647,7 +645,7 @@ def run_training(
         raise ValueError(
             f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}"
         )
 
-    if process_data:
+    if train_args.process_data:
         dp.main(
             DataProcessArgs(
                 # XXX(osilkin): make a decision here, either: