From 52d213173ff844bc2ac5369c22ce35110a2bbe9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:29:09 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=9C=20Use=20field=20in=20dataclasses?= =?UTF-8?q?=20(#2494)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * in hh-rlhf-helpful-base * delete tokenize ds * dataset scripts * alignprop * judge tldr * ddpo * zen * sft video * literal to choices * chat * script args * alignprop * bco * better help format * cpo * ddpo * whether or not -> whether * dpo * dont set the possible values * `Optional[...]` to ... or `None` * xpo * gkd * kto * nash * online dpo * Fix typo in learning rate help message * orpo * more ... or `None` * model config * ppo * prm * reward * rloo * sft * online policy config * make style --- examples/datasets/hh-rlhf-helpful-base.py | 17 +- .../lm-human-preferences-descriptiveness.py | 19 +- .../lm-human-preferences-sentiment.py | 19 +- examples/datasets/math_shepherd.py | 19 +- examples/datasets/prm800k.py | 19 +- examples/datasets/rlaif-v.py | 19 +- examples/datasets/tldr.py | 19 +- examples/datasets/tldr_preference.py | 19 +- examples/datasets/tokenize_ds.py | 54 ---- examples/datasets/ultrafeedback-prompt.py | 19 +- examples/datasets/ultrafeedback.py | 81 +++-- examples/scripts/alignprop.py | 28 +- examples/scripts/ddpo.py | 28 +- examples/scripts/evals/judge_tldr.py | 20 +- examples/scripts/sft_video_llm.py | 12 +- scripts/generate_zen_dataset.py | 17 +- trl/data_utils.py | 6 +- trl/mergekit_utils.py | 8 +- trl/models/modeling_sd_base.py | 4 +- trl/models/utils.py | 2 +- trl/scripts/chat.py | 130 +++++--- trl/scripts/utils.py | 32 +- trl/trainer/alignprop_config.py | 119 +++++--- trl/trainer/bco_config.py | 138 +++++++-- trl/trainer/cpo_config.py | 124 ++++++-- trl/trainer/ddpo_config.py | 216 ++++++++++--- trl/trainer/dpo_config.py | 285 ++++++++++++++---- trl/trainer/gkd_config.py | 64 +++- trl/trainer/judges.py | 2 +- trl/trainer/kto_config.py | 151 ++++++++-- trl/trainer/model_config.py | 114 +++++-- trl/trainer/nash_md_config.py | 9 +- trl/trainer/online_dpo_config.py | 79 ++++- trl/trainer/orpo_config.py | 104 +++++-- trl/trainer/ppo_config.py | 66 +++- trl/trainer/prm_config.py | 47 ++- trl/trainer/reward_config.py | 40 ++- trl/trainer/rloo_config.py | 37 ++- trl/trainer/sft_config.py | 88 ++++-- trl/trainer/utils.py | 147 ++++++--- trl/trainer/xpo_config.py | 11 +- 41 files changed, 1827 insertions(+), 605 deletions(-) delete mode 100644 examples/datasets/tokenize_ds.py diff --git a/examples/datasets/hh-rlhf-helpful-base.py b/examples/datasets/hh-rlhf-helpful-base.py index e089ed108e..2a68daf7ec 100644 --- a/examples/datasets/hh-rlhf-helpful-base.py +++ b/examples/datasets/hh-rlhf-helpful-base.py @@ -13,7 +13,7 @@ # limitations under the License. import re -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -30,13 +30,20 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/hh-rlhf-helpful-base"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/hh-rlhf-helpful-base" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/hh-rlhf-helpful-base", metadata={"help": "Hugging Face repository ID to push the dataset to."} + ) + dataset_num_proc: Optional[int] = field( + default=None, metadata={"help": "Number of workers to use for dataset processing."} + ) def common_start(str1: str, str2: str) -> str: diff --git a/examples/datasets/lm-human-preferences-descriptiveness.py b/examples/datasets/lm-human-preferences-descriptiveness.py index 621757770c..a078b1a0eb 100644 --- a/examples/datasets/lm-human-preferences-descriptiveness.py +++ b/examples/datasets/lm-human-preferences-descriptiveness.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-descriptiveness"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/lm-human-preferences-descriptiveness" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/lm-human-preferences-descriptiveness", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) # Edge cases handling: remove the cases where all samples are the same diff --git a/examples/datasets/lm-human-preferences-sentiment.py b/examples/datasets/lm-human-preferences-sentiment.py index a3eaa4d06e..cbacab91a9 100644 --- a/examples/datasets/lm-human-preferences-sentiment.py +++ b/examples/datasets/lm-human-preferences-sentiment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-sentiment"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/lm-human-preferences-sentiment" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/lm-human-preferences-sentiment", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_prompt_completion(example, tokenizer): diff --git a/examples/datasets/math_shepherd.py b/examples/datasets/math_shepherd.py index c09e745ad5..214636fcde 100644 --- a/examples/datasets/math_shepherd.py +++ b/examples/datasets/math_shepherd.py @@ -13,7 +13,7 @@ # limitations under the License. import re -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -31,13 +31,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/math_shepherd"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/math_shepherd" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/math_shepherd", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def process_example(example): diff --git a/examples/datasets/prm800k.py b/examples/datasets/prm800k.py index b5f95742be..3078ab71ad 100644 --- a/examples/datasets/prm800k.py +++ b/examples/datasets/prm800k.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/prm800k"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/prm800k" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/prm800k", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def process_example(example): diff --git a/examples/datasets/rlaif-v.py b/examples/datasets/rlaif-v.py index 84ae292f87..dfe87d4d83 100644 --- a/examples/datasets/rlaif-v.py +++ b/examples/datasets/rlaif-v.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import features, load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/rlaif-v"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/rlaif-v" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/rlaif-v", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_conversational(example): diff --git a/examples/datasets/tldr.py b/examples/datasets/tldr.py index 0ae29481e3..767385c339 100644 --- a/examples/datasets/tldr.py +++ b/examples/datasets/tldr.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/tldr"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/tldr" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/tldr", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_prompt_completion(example): diff --git a/examples/datasets/tldr_preference.py b/examples/datasets/tldr_preference.py index 1c4ff5bcbd..aa110af0c6 100644 --- a/examples/datasets/tldr_preference.py +++ b/examples/datasets/tldr_preference.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/tldr-preference"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/tldr-preference" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/tldr-preference", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_preference(example): diff --git a/examples/datasets/tokenize_ds.py b/examples/datasets/tokenize_ds.py deleted file mode 100644 index cd96a685a9..0000000000 --- a/examples/datasets/tokenize_ds.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset -from transformers import AutoTokenizer, HfArgumentParser - -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE - - -""" -python -i examples/datasets/tokenize_ds.py --model HuggingFaceH4/zephyr-7b-beta -python -i examples/datasets/tokenize_ds.py --model gpt2 -""" - - -@dataclass -class ScriptArguments: - dataset_name: str = field( - default="trl-internal-testing/hh-rlhf-helpful-base-trl-style", metadata={"help": "The dataset to load"} - ) - model: str = field(default="gpt2", metadata={"help": "The model to use for tokenization"}) - dataset_num_proc: Optional[int] = field( - default=None, metadata={"help": "The number of workers to use to tokenize the data"} - ) - - -if __name__ == "__main__": - script_args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0] - dataset = load_dataset(script_args.dataset_name) - tokenizer = AutoTokenizer.from_pretrained(script_args.model) - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE - - def process(row): - row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False) - row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False) - return row - - dataset = dataset.map(process, num_proc=script_args.dataset_num_proc) - print(dataset["train"][0]["chosen"]) diff --git a/examples/datasets/ultrafeedback-prompt.py b/examples/datasets/ultrafeedback-prompt.py index 3cb92467d5..7fecadc403 100644 --- a/examples/datasets/ultrafeedback-prompt.py +++ b/examples/datasets/ultrafeedback-prompt.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-prompt"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/ultrafeedback-prompt" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/ultrafeedback-prompt", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_unpaired_preference(example): diff --git a/examples/datasets/ultrafeedback.py b/examples/datasets/ultrafeedback.py index cb6c556d0c..9670bef6d3 100644 --- a/examples/datasets/ultrafeedback.py +++ b/examples/datasets/ultrafeedback.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -27,46 +27,61 @@ class ScriptArguments: Args: model_name (`str`, *optional*, defaults to `"gpt-3.5-turbo"`): Language model to target. 
Possible values are: - - - `"alpaca-7b"` - - `"bard"` - - `"falcon-40b-instruct"` - - `"gpt-3.5-turbo"` (default) - - `"gpt-4"` - - `"llama-2-13b-chat"` - - `"llama-2-70b-chat"` - - `"llama-2-7b-chat"` - - `"mpt-30b-chat"` - - `"pythia-12b"` - - `"starchat"` - - `"ultralm-13b"` - - `"ultralm-65b"` - - `"vicuna-33b"` - - `"wizardlm-13b"` - - `"wizardlm-70b"` - - `"wizardlm-7b"` - aspect (`str`, *optional*, defaults to `"helpfulness"`): - Aspect to target. Possible values are: - - - `"helpfulness"` (default) - - `"honesty"` - - `"instruction-following"` - - `"truthfulness"` - + Aspect to target. push_to_hub (`bool`, *optional*, defaults to `False`): Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - model_name: str = "gpt-3.5-turbo" - aspect: str = "helpfulness" - push_to_hub: bool = False - repo_id: str = "trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness" - dataset_num_proc: Optional[int] = None + model_name: str = field( + default="gpt-3.5-turbo", + metadata={ + "help": "Language model to target.", + "choices": [ + "alpaca-7b", + "bard", + "falcon-40b-instruct", + "gpt-3.5-turbo", + "gpt-4", + "llama-2-13b-chat", + "llama-2-70b-chat", + "llama-2-7b-chat", + "mpt-30b-chat", + "pythia-12b", + "starchat", + "ultralm-13b", + "ultralm-65b", + "vicuna-33b", + "wizardlm-13b", + "wizardlm-70b", + "wizardlm-7b", + ], + }, + ) + aspect: str = field( + default="helpfulness", + metadata={ + "help": "Aspect to target. Possible values are: 'helpfulness' (default), 'honesty', " + "'instruction-following', 'truthfulness'.", + "choices": ["helpfulness", "honesty", "instruction-following", "truthfulness"], + }, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_unpaired_preference(example, model_name, aspect): diff --git a/examples/scripts/alignprop.py b/examples/scripts/alignprop.py index 918619c1e8..2203d06e32 100644 --- a/examples/scripts/alignprop.py +++ b/examples/scripts/alignprop.py @@ -37,20 +37,38 @@ @dataclass class ScriptArguments: + r""" + Arguments for the script. + + Args: + pretrained_model (`str`, *optional*, defaults to `"runwayml/stable-diffusion-v1-5"`): + Pretrained model to use. + pretrained_revision (`str`, *optional*, defaults to `"main"`): + Pretrained model revision to use. + hf_hub_model_id (`str`, *optional*, defaults to `"alignprop-finetuned-stable-diffusion"`): + HuggingFace repo to save model weights to. + hf_hub_aesthetic_model_id (`str`, *optional*, defaults to `"trl-lib/ddpo-aesthetic-predictor"`): + Hugging Face model ID for aesthetic scorer model weights. + hf_hub_aesthetic_model_filename (`str`, *optional*, defaults to `"aesthetic-model.pth"`): + Hugging Face model filename for aesthetic scorer model weights. + use_lora (`bool`, *optional*, defaults to `True`): + Whether to use LoRA. 
+ """ + pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} + default="runwayml/stable-diffusion-v1-5", metadata={"help": "Pretrained model to use."} ) - pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) + pretrained_revision: str = field(default="main", metadata={"help": "Pretrained model revision to use."}) hf_hub_model_id: str = field( - default="alignprop-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"} + default="alignprop-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to."} ) hf_hub_aesthetic_model_id: str = field( default="trl-lib/ddpo-aesthetic-predictor", - metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model ID for aesthetic scorer model weights."}, ) hf_hub_aesthetic_model_filename: str = field( default="aesthetic-model.pth", - metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model filename for aesthetic scorer model weights."}, ) use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."}) diff --git a/examples/scripts/ddpo.py b/examples/scripts/ddpo.py index 7919d5244a..07b1fbe84c 100644 --- a/examples/scripts/ddpo.py +++ b/examples/scripts/ddpo.py @@ -41,20 +41,38 @@ @dataclass class ScriptArguments: + r""" + Arguments for the script. + + Args: + pretrained_model (`str`, *optional*, defaults to `"runwayml/stable-diffusion-v1-5"`): + Pretrained model to use. + pretrained_revision (`str`, *optional*, defaults to `"main"`): + Pretrained model revision to use. + hf_hub_model_id (`str`, *optional*, defaults to `"ddpo-finetuned-stable-diffusion"`): + HuggingFace repo to save model weights to. + hf_hub_aesthetic_model_id (`str`, *optional*, defaults to `"trl-lib/ddpo-aesthetic-predictor"`): + Hugging Face model ID for aesthetic scorer model weights. + hf_hub_aesthetic_model_filename (`str`, *optional*, defaults to `"aesthetic-model.pth"`): + Hugging Face model filename for aesthetic scorer model weights. + use_lora (`bool`, *optional*, defaults to `True`): + Whether to use LoRA. 
+ """ + pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} + default="runwayml/stable-diffusion-v1-5", metadata={"help": "Pretrained model to use."} ) - pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) + pretrained_revision: str = field(default="main", metadata={"help": "Pretrained model revision to use."}) hf_hub_model_id: str = field( - default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"} + default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to."} ) hf_hub_aesthetic_model_id: str = field( default="trl-lib/ddpo-aesthetic-predictor", - metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model ID for aesthetic scorer model weights."}, ) hf_hub_aesthetic_model_filename: str = field( default="aesthetic-model.pth", - metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model filename for aesthetic scorer model weights."}, ) use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."}) diff --git a/examples/scripts/evals/judge_tldr.py b/examples/scripts/evals/judge_tldr.py index f9e51df729..537415d62c 100644 --- a/examples/scripts/evals/judge_tldr.py +++ b/examples/scripts/evals/judge_tldr.py @@ -47,14 +47,28 @@ @dataclass class ScriptArguments: - model_name_or_path: str = field(metadata={"help": "The model name or path to the model to evaluate."}) + r""" + Arguments for the script. + + Args: + model_name_or_path (`str`): + Model name or path to the model to evaluate. + judge_model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): + Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or + 'meta-llama/Meta-Llama-3-70B-Instruct'. + num_examples (`int` or `None`, *optional*, defaults to `None`): + Number of examples to evaluate. + """ + + model_name_or_path: str = field(metadata={"help": "Model name or path to the model to evaluate."}) judge_model: str = field( default="meta-llama/Meta-Llama-3-70B-Instruct", metadata={ - "help": "The model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125', 'meta-llama/Meta-Llama-3-70B-Instruct'." + "help": "Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or " + "'meta-llama/Meta-Llama-3-70B-Instruct'." }, ) - num_examples: Optional[int] = field(default=None, metadata={"help": "The number of examples to evaluate."}) + num_examples: Optional[int] = field(default=None, metadata={"help": "Number of examples to evaluate."}) # Parse the arguments diff --git a/examples/scripts/sft_video_llm.py b/examples/scripts/sft_video_llm.py index 4a85114d4f..dd64936077 100644 --- a/examples/scripts/sft_video_llm.py +++ b/examples/scripts/sft_video_llm.py @@ -45,7 +45,7 @@ import json import os import random -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any import requests @@ -152,7 +152,15 @@ def collate_fn(examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]: @dataclass class CustomScriptArguments(ScriptArguments): - video_cache_dir: str = "/tmp/videos/" + r""" + Arguments for the script. + + Args: + video_cache_dir (`str`, *optional*, defaults to `"/tmp/videos/"`): + Video cache directory. 
+ """ + + video_cache_dir: str = field(default="/tmp/videos/", metadata={"help": "Video cache directory."}) if __name__ == "__main__": diff --git a/scripts/generate_zen_dataset.py b/scripts/generate_zen_dataset.py index 73c7c16f82..4bfc4e23f6 100644 --- a/scripts/generate_zen_dataset.py +++ b/scripts/generate_zen_dataset.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from datasets import Dataset from transformers import HfArgumentParser @@ -32,9 +32,18 @@ class ScriptArguments: Hugging Face repository ID to push the dataset to. """ - test_size: float = 0.1 - push_to_hub: bool = False - repo_id: str = "trl-internal-testing/zen" + test_size: float = field( + default=0.1, + metadata={"help": "Fraction of the dataset to include in the test split."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-internal-testing/zen", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) def main(test_size, push_to_hub, repo_id): diff --git a/trl/data_utils.py b/trl/data_utils.py index 8c8f448adf..fc8cd775a5 100644 --- a/trl/data_utils.py +++ b/trl/data_utils.py @@ -173,7 +173,7 @@ def maybe_apply_chat_template( messages, where each message is a dictionary with keys `"role"` and `"content"`. tokenizer (`PreTrainedTokenizer`): The tokenizer to apply the chat template with. - tools (`Optional[list[Union[dict, Callable]]]`, *optional*, defaults to `None`): + tools (`list[Union[dict, Callable]]` or `None`, *optional*, defaults to `None`): A list of tools (callable functions) that will be accessible to the model. If the template does not support function calling, this argument will have no effect @@ -224,7 +224,7 @@ def unpair_preference_dataset( dataset (`Dataset` or `DatasetDict`): Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally `"prompt"`. - num_proc (`Optional[int]`, *optional*, defaults to `None`): + num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. desc (`str` or `None`, *optional*, defaults to `None`): Meaningful description to be displayed alongside with the progress bar while mapping examples. @@ -265,7 +265,7 @@ def maybe_unpair_preference_dataset( dataset (`Dataset` or `DatasetDict`): Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally `"prompt"`. - num_proc (`Optional[int]`, *optional*, defaults to `None`): + num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. desc (`str` or `None`, *optional*, defaults to `None`): Meaningful description to be displayed alongside with the progress bar while mapping examples. diff --git a/trl/mergekit_utils.py b/trl/mergekit_utils.py index 936c42626c..382b91e61a 100644 --- a/trl/mergekit_utils.py +++ b/trl/mergekit_utils.py @@ -59,14 +59,14 @@ class MergeConfig: Attributes: method (`str`): The merge method to use. - policy_model_path (`Optional[str]`): Path to the policy model. - target_model_path (`Optional[str]`): Path to the target model. + policy_model_path (`str` or `None`): Path to the policy model. + target_model_path (`str` or `None`): Path to the target model. 
         policy_model_weight (`float`): Weight for the policy model (for `linear` and `ties` methods).
         target_model_weight (`float`): Weight for the target model (for `linear` and `ties` methods).
         policy_model_density (`list[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
         target_model_density (`list[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
-        normalize (`Optional[float]`): Normalization factor for the TIES method.
-        t_values (`Optional[float]`): Interpolation factor for the SLERP method.
+        normalize (`float` or `None`): Normalization factor for the TIES method.
+        t_values (`float` or `None`): Interpolation factor for the SLERP method.
         dtype (`str`): Data type to use for merging, e.g., `"float16"`.
     """
diff --git a/trl/models/modeling_sd_base.py b/trl/models/modeling_sd_base.py
index 131d8d8016..dbe1cfd8f0 100644
--- a/trl/models/modeling_sd_base.py
+++ b/trl/models/modeling_sd_base.py
@@ -384,7 +384,7 @@ def pipeline_step(
             The output format of the generate image. Choose between
             [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
         return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+            Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
             plain tuple.
         callback (`Callable`, *optional*):
             A function that will be called every `callback_steps` steps during inference. The function will be
@@ -615,7 +615,7 @@ def pipeline_step_with_grad(
             The output format of the generate image. Choose between
             [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
         return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+            Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
             plain tuple.
         callback (`Callable`, *optional*):
             A function that will be called every `callback_steps` steps during inference. The function will be
diff --git a/trl/models/utils.py b/trl/models/utils.py
index 53cf481f1f..3f26146d0e 100644
--- a/trl/models/utils.py
+++ b/trl/models/utils.py
@@ -90,7 +90,7 @@ def setup_chat_format(
         model (`~transformers.PreTrainedModel`): The model to be modified.
         tokenizer (`~transformers.PreTrainedTokenizer`): The tokenizer to be modified.
         format (`Optional[Literal["chatml"]]`): The format to be set. Defaults to "chatml".
-        resize_to_multiple_of (`Optional[int]`): Number to resize the embedding layer to. Defaults to None.
+        resize_to_multiple_of (`int` or `None`): Number to resize the embedding layer to. Defaults to None.
 
     Returns:
         model (`~transformers.PreTrainedModel`): The modified model.
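The docstring changes above, rewriting `Optional[...]` as "`X` or `None`", are cosmetic; the substantive conversion this patch repeats in every config is turning a bare class-level default such as `dataset_num_proc: Optional[int] = None` into a `dataclasses.field(...)` whose `metadata["help"]` string `HfArgumentParser` surfaces in `--help` output. A minimal sketch of that mechanism, assuming a hypothetical `DemoArguments` class and `demo.py` script that are not part of this patch:

# Minimal sketch of the field/metadata pattern used throughout this patch.
# `DemoArguments` and `demo.py` are hypothetical, not files in this diff.
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoArguments:
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether to push the dataset to the Hugging Face Hub."},
    )
    dataset_num_proc: Optional[int] = field(
        default=None,
        metadata={"help": "Number of workers to use for dataset processing."},
    )


if __name__ == "__main__":
    parser = HfArgumentParser(DemoArguments)
    # `python demo.py --help` now lists both options with the help strings above;
    # with a bare `push_to_hub: bool = False` the option would appear undocumented.
    (args,) = parser.parse_args_into_dataclasses()
    print(args)

The annotations themselves keep `Optional[...]` because the parser derives each option's type from the type hint; only the docstrings, which feed the rendered API documentation, switch to the "`int` or `None`" phrasing.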
diff --git a/trl/scripts/chat.py b/trl/scripts/chat.py index fa9eebc44e..5adc6ea779 100644 --- a/trl/scripts/chat.py +++ b/trl/scripts/chat.py @@ -22,6 +22,7 @@ import time from dataclasses import dataclass, field from threading import Thread +from typing import Optional import torch import yaml @@ -81,67 +82,118 @@ @dataclass class ChatArguments: - # general settings - model_name_or_path: str = field(metadata={"help": "Name of the pre-trained model"}) - user: str = field(default=None, metadata={"help": "Username to display in chat interface"}) - system_prompt: str = field(default=None, metadata={"help": "System prompt"}) - save_folder: str = field(default="./chat_history/", metadata={"help": "Folder to save chat history"}) - device: str = field( - default="cpu", - metadata={"help": "device to use for inference."}, - ) - examples_path: str = field(default=None, metadata={"help": "Path to a yaml file with examples"}) - # generation settings - max_new_tokens: int = field(default=256, metadata={"help": "Maximum number of tokens to generate"}) - do_sample: bool = field(default=True, metadata={"help": "Whether to sample outputs during generation"}) - num_beams: int = field(default=1, metadata={"help": "Number of beams for beam search"}) - temperature: float = field(default=1.0, metadata={"help": "Temperature parameter for generation"}) - top_k: int = field(default=50, metadata={"help": "Value of k for top-k sampling"}) - top_p: float = field(default=1.0, metadata={"help": "Value of p for nucleus sampling"}) - repetition_penalty: float = field(default=1.0, metadata={"help": "Repetition penalty"}) - eos_tokens: str = field( + r""" + Arguments for the chat script. + + Args: + model_name_or_path (`str`): + Name of the pre-trained model. + user (`str` or `None`, *optional*, defaults to `None`): + Username to display in chat interface. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + System prompt. + save_folder (`str`, *optional*, defaults to `"./chat_history/"`): + Folder to save chat history. + device (`str`, *optional*, defaults to `"cpu"`): + Device to use for inference. + examples_path (`str` or `None`, *optional*, defaults to `None`): + Path to a yaml file with examples. + max_new_tokens (`int`, *optional*, defaults to `256`): + Maximum number of tokens to generate. + do_sample (`bool`, *optional*, defaults to `True`): + Whether to sample outputs during generation. + num_beams (`int`, *optional*, defaults to `1`): + Number of beams for beam search. + temperature (`float`, *optional*, defaults to `1.0`): + Temperature parameter for generation. + top_k (`int`, *optional*, defaults to `50`): + Value of k for top-k sampling. + top_p (`float`, *optional*, defaults to `1.0`): + Value of p for nucleus sampling. + repetition_penalty (`float`, *optional*, defaults to `1.0`): + Repetition penalty. + eos_tokens (`str` or `None`, *optional*, defaults to `None`): + EOS tokens to stop the generation. If multiple they should be comma separated. + eos_token_ids (`str` or `None`, *optional*, defaults to `None`): + EOS token IDs to stop the generation. If multiple they should be comma separated. + model_revision (`str`, *optional*, defaults to `"main"`): + Specific model version to use (can be a branch name, tag name or commit id). + torch_dtype (`str` or `None`, *optional*, defaults to `None`): + Override the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype + will be automatically derived from the model's weights. 
+ trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether to trust remote code when loading a model. + attn_implementation (`str` or `None`, *optional*, defaults to `None`): + Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case + you must install this manually by running `pip install flash-attn --no-build-isolation`. + load_in_8bit (`bool`, *optional*, defaults to `False`): + Whether to use 8 bit precision for the base model - works only with LoRA. + load_in_4bit (`bool`, *optional*, defaults to `False`): + Whether to use 4 bit precision for the base model - works only with LoRA. + bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`): + Quantization type. + use_bnb_nested_quant (`bool`, *optional*, defaults to `False`): + Whether to use nested quantization. + """ + + # General settings + model_name_or_path: str = field(metadata={"help": "Name of the pre-trained model."}) + user: Optional[str] = field(default=None, metadata={"help": "Username to display in chat interface."}) + system_prompt: Optional[str] = field(default=None, metadata={"help": "System prompt."}) + save_folder: str = field(default="./chat_history/", metadata={"help": "Folder to save chat history."}) + device: str = field(default="cpu", metadata={"help": "Device to use for inference."}) + examples_path: Optional[str] = field(default=None, metadata={"help": "Path to a yaml file with examples."}) + + # Generation settings + max_new_tokens: int = field(default=256, metadata={"help": "Maximum number of tokens to generate."}) + do_sample: bool = field(default=True, metadata={"help": "Whether to sample outputs during generation."}) + num_beams: int = field(default=1, metadata={"help": "Number of beams for beam search."}) + temperature: float = field(default=1.0, metadata={"help": "Temperature parameter for generation."}) + top_k: int = field(default=50, metadata={"help": "Value of k for top-k sampling."}) + top_p: float = field(default=1.0, metadata={"help": "Value of p for nucleus sampling."}) + repetition_penalty: float = field(default=1.0, metadata={"help": "Repetition penalty."}) + eos_tokens: Optional[str] = field( default=None, - metadata={"help": "EOS tokens to stop the generation. If multiple they should be comma separated"}, + metadata={"help": "EOS tokens to stop the generation. If multiple they should be comma separated."}, ) - eos_token_ids: str = field( + eos_token_ids: Optional[str] = field( default=None, - metadata={"help": "EOS token IDs to stop the generation. If multiple they should be comma separated"}, + metadata={"help": "EOS token IDs to stop the generation. If multiple they should be comma separated."}, ) - # model loading + + # Model loading model_revision: str = field( default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + metadata={"help": "Specific model version to use (can be a branch name, tag name or commit id)."}, ) - torch_dtype: str = field( + torch_dtype: Optional[str] = field( default=None, metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), + "help": "Override the default `torch.dtype` and load the model under this dtype. 
If `'auto'` is passed, " + "the dtype will be automatically derived from the model's weights.", "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) - attn_implementation: str = field( + trust_remote_code: bool = field( + default=False, metadata={"help": "Whether to trust remote code when loading a model."} + ) + attn_implementation: Optional[str] = field( default=None, metadata={ - "help": ( - "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" - ) + "help": "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in " + "which case you must install this manually by running `pip install flash-attn --no-build-isolation`." }, ) load_in_8bit: bool = field( default=False, - metadata={"help": "use 8 bit precision for the base model - works only with LoRA"}, + metadata={"help": "Whether to use 8 bit precision for the base model - works only with LoRA."}, ) load_in_4bit: bool = field( default=False, - metadata={"help": "use 4 bit precision for the base model - works only with LoRA"}, + metadata={"help": "Whether to use 4 bit precision for the base model - works only with LoRA."}, ) - - bnb_4bit_quant_type: str = field(default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}) - use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) + bnb_4bit_quant_type: str = field(default="nf4", metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]}) + use_bnb_nested_quant: bool = field(default=False, metadata={"help": "Whether to use nested quantization."}) class RichInterface: diff --git a/trl/scripts/utils.py b/trl/scripts/utils.py index e386a19d37..a381ffe300 100644 --- a/trl/scripts/utils.py +++ b/trl/scripts/utils.py @@ -18,7 +18,7 @@ import os import subprocess import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Iterable, Optional, Union import yaml @@ -44,18 +44,34 @@ class ScriptArguments: dataset_test_split (`str`, *optional*, defaults to `"test"`): Dataset split to use for evaluation. gradient_checkpointing_use_reentrant (`bool`, *optional*, defaults to `False`): - Whether to apply `use_reentrant` for gradient_checkpointing. + Whether to apply `use_reentrant` for gradient checkpointing. ignore_bias_buffers (`bool`, *optional*, defaults to `False`): Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992. """ - dataset_name: str - dataset_config: Optional[str] = None - dataset_train_split: str = "train" - dataset_test_split: str = "test" - gradient_checkpointing_use_reentrant: bool = False - ignore_bias_buffers: bool = False + dataset_name: str = field(metadata={"help": "Dataset name."}) + dataset_config: Optional[str] = field( + default=None, + metadata={ + "help": "Dataset configuration name. Corresponds to the `name` argument of the `datasets.load_dataset` " + "function." 
+ }, + ) + dataset_train_split: str = field(default="train", metadata={"help": "Dataset split to use for training."}) + dataset_test_split: str = field(default="test", metadata={"help": "Dataset split to use for evaluation."}) + gradient_checkpointing_use_reentrant: bool = field( + default=False, + metadata={"help": "Whether to apply `use_reentrant` for gradient checkpointing."}, + ) + ignore_bias_buffers: bool = field( + default=False, + metadata={ + "help": "Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid " + "scalar type, inplace operation. See " + "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992." + }, + ) def init_zero_verbose(): diff --git a/trl/trainer/alignprop_config.py b/trl/trainer/alignprop_config.py index 1c4faa963e..6cfc061d31 100644 --- a/trl/trainer/alignprop_config.py +++ b/trl/trainer/alignprop_config.py @@ -15,7 +15,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Any, Literal, Optional +from typing import Any, Optional from transformers import is_bitsandbytes_available @@ -36,7 +36,9 @@ class AlignPropConfig: Name of this experiment (defaults to the file name without the extension). run_name (`str`, *optional*, defaults to `""`): Name of this run. - log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + seed (`int`, *optional*, defaults to `0`): + Random seed for reproducibility. + log_with (`str` or `None`, *optional*, defaults to `None`): Log with either `"wandb"` or `"tensorboard"`. Check [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. log_image_freq (`int`, *optional*, defaults to `1`): @@ -69,6 +71,8 @@ class AlignPropConfig: Eta parameter for the DDIM sampler. sample_guidance_scale (`float`, *optional*, defaults to `5.0`): Classifier-free guidance weight. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size for training. train_use_8bit_adam (`bool`, *optional*, defaults to `False`): Whether to use the 8bit Adam optimizer from `bitsandbytes`. train_learning_rate (`float`, *optional*, defaults to `1e-3`): @@ -85,7 +89,7 @@ class AlignPropConfig: Number of gradient accumulation steps. train_max_grad_norm (`float`, *optional*, defaults to `1.0`): Maximum gradient norm for gradient clipping. - negative_prompts (`Optional[str]`, *optional*, defaults to `None`): + negative_prompts (`str` or `None`, *optional*, defaults to `None`): Comma-separated list of prompts to use as negative examples. truncated_backprop_rand (`bool`, *optional*, defaults to `True`): If `True`, randomized truncation to different diffusion timesteps is used. @@ -97,39 +101,82 @@ class AlignPropConfig: Whether to push the final model to the Hub. 
""" - exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - run_name: str = "" - seed: int = 0 - log_with: Optional[Literal["wandb", "tensorboard"]] = None - log_image_freq: int = 1 - tracker_kwargs: dict[str, Any] = field(default_factory=dict) - accelerator_kwargs: dict[str, Any] = field(default_factory=dict) - project_kwargs: dict[str, Any] = field(default_factory=dict) - tracker_project_name: str = "trl" - logdir: str = "logs" - num_epochs: int = 100 - save_freq: int = 1 - num_checkpoint_limit: int = 5 - mixed_precision: str = "fp16" - allow_tf32: bool = True - resume_from: str = "" - sample_num_steps: int = 50 - sample_eta: float = 1.0 - sample_guidance_scale: float = 5.0 - train_batch_size: int = 1 - train_use_8bit_adam: bool = False - train_learning_rate: float = 1e-3 - train_adam_beta1: float = 0.9 - train_adam_beta2: float = 0.999 - train_adam_weight_decay: float = 1e-4 - train_adam_epsilon: float = 1e-8 - train_gradient_accumulation_steps: int = 1 - train_max_grad_norm: float = 1.0 - negative_prompts: Optional[str] = None - truncated_backprop_rand: bool = True - truncated_backprop_timestep: int = 49 - truncated_rand_backprop_minmax: tuple[int, int] = (0, 50) - push_to_hub: bool = False + exp_name: str = field( + default=os.path.basename(sys.argv[0])[: -len(".py")], + metadata={"help": "Name of this experiment (defaults to the file name without the extension)."}, + ) + run_name: str = field(default="", metadata={"help": "Name of this run."}) + seed: int = field(default=0, metadata={"help": "Random seed for reproducibility."}) + log_with: Optional[str] = field( + default=None, + metadata={"help": "Log with either 'wandb' or 'tensorboard'.", "choices": ["wandb", "tensorboard"]}, + ) + log_image_freq: int = field(default=1, metadata={"help": "Frequency for logging images."}) + tracker_kwargs: dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the tracker (e.g., `wandb_project`)."}, + ) + accelerator_kwargs: dict[str, Any] = field( + default_factory=dict, metadata={"help": "Keyword arguments for the accelerator."} + ) + project_kwargs: dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator project config (e.g., `logging_dir`)."}, + ) + tracker_project_name: str = field(default="trl", metadata={"help": "Name of project to use for tracking."}) + logdir: str = field(default="logs", metadata={"help": "Top-level logging directory for checkpoint saving."}) + num_epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) + save_freq: int = field(default=1, metadata={"help": "Number of epochs between saving model checkpoints."}) + num_checkpoint_limit: int = field( + default=5, metadata={"help": "Number of checkpoints to keep before overwriting old ones."} + ) + mixed_precision: str = field( + default="fp16", + metadata={ + "help": "Mixed precision training. 
Possible values are 'fp16', 'bf16', 'none'.", + "choices": ["fp16", "bf16", "none"], + }, + ) + allow_tf32: bool = field(default=True, metadata={"help": "Allow `tf32` on Ampere GPUs."}) + resume_from: str = field(default="", metadata={"help": "Path to resume training from a checkpoint."}) + sample_num_steps: int = field(default=50, metadata={"help": "Number of sampler inference steps."}) + sample_eta: float = field(default=1.0, metadata={"help": "Eta parameter for the DDIM sampler."}) + sample_guidance_scale: float = field(default=5.0, metadata={"help": "Classifier-free guidance weight."}) + train_batch_size: int = field(default=1, metadata={"help": "Batch size for training."}) + train_use_8bit_adam: bool = field( + default=False, metadata={"help": "Whether to use the 8bit Adam optimizer from `bitsandbytes`."} + ) + train_learning_rate: float = field(default=1e-3, metadata={"help": "Learning rate."}) + train_adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer."}) + train_adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer."}) + train_adam_weight_decay: float = field(default=1e-4, metadata={"help": "Weight decay for Adam optimizer."}) + train_adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon value for Adam optimizer."}) + train_gradient_accumulation_steps: int = field( + default=1, metadata={"help": "Number of gradient accumulation steps."} + ) + train_max_grad_norm: float = field(default=1.0, metadata={"help": "Maximum gradient norm for gradient clipping."}) + negative_prompts: Optional[str] = field( + default=None, + metadata={"help": "Comma-separated list of prompts to use as negative examples."}, + ) + truncated_backprop_rand: bool = field( + default=True, + metadata={"help": "If `True`, randomized truncation to different diffusion timesteps is used."}, + ) + truncated_backprop_timestep: int = field( + default=49, + metadata={ + "help": "Absolute timestep to which the gradients are backpropagated. Used only if " + "`truncated_backprop_rand=False`." + }, + ) + truncated_rand_backprop_minmax: tuple[int, int] = field( + default=(0, 50), + metadata={ + "help": "Range of diffusion timesteps for randomized truncated backpropagation.", + }, + ) + push_to_hub: bool = field(default=False, metadata={"help": "Whether to push the final model to the Hub."}) def to_dict(self): output_dict = {} diff --git a/trl/trainer/bco_config.py b/trl/trainer/bco_config.py index 5163080112..b6a5db07b0 100644 --- a/trl/trainer/bco_config.py +++ b/trl/trainer/bco_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Optional from transformers import TrainingArguments @@ -28,12 +28,12 @@ class BCOConfig(TrainingArguments): command line. Parameters: - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. 
- max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion. This argument is required if you want to use the default data collator and your model is an encoder-decoder. beta (`float`, *optional*, defaults to `0.1`): @@ -41,7 +41,7 @@ class BCOConfig(TrainingArguments): reference model. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. @@ -51,19 +51,19 @@ class BCOConfig(TrainingArguments): generate_during_eval (`bool`, *optional*, defaults to `False`): If `True`, generates and logs completions from both the model and the reference model to W&B or Comet during evaluation. - is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, you need to specify if the model returned by the callable is an encoder-decoder model. precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): Whether to precompute reference model log probabilities for training and evaluation datasets. This is useful when training without the reference model to reduce the total GPU memory needed. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model from a string. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. prompt_sample_size (`int`, *optional*, defaults to `1024`): Number of prompts that are fed to density ratio classifier. @@ -73,20 +73,106 @@ class BCOConfig(TrainingArguments): Maximum value of the density ratio. The estimated density ratio is clamped to this value. 
""" - max_length: Optional[int] = None - max_prompt_length: Optional[int] = None - max_completion_length: Optional[int] = None - beta: float = 0.1 - label_pad_token_id: int = -100 - padding_value: Optional[int] = None - truncation_mode: str = "keep_end" - disable_dropout: bool = True - generate_during_eval: bool = False - is_encoder_decoder: Optional[bool] = None - precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[dict[str, Any]] = None - ref_model_init_kwargs: Optional[dict[str, Any]] = None - dataset_num_proc: Optional[int] = None - prompt_sample_size: int = 1024 - min_density_ratio: float = 0.5 - max_density_ratio: float = 10.0 + max_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the sequences (prompt + completion) in the batch. " + "This argument is required if you want to use the default data collator." + }, + ) + max_prompt_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the prompt. " + "This argument is required if you want to use the default data collator." + }, + ) + max_completion_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the completion. This argument is required if you want to use the " + "default data collator and your model is an encoder-decoder." + }, + ) + beta: float = field( + default=0.1, + metadata={ + "help": "Parameter controlling the deviation from the reference model. " + "Higher β means less deviation from the reference model." + }, + ) + label_pad_token_id: int = field( + default=-100, + metadata={ + "help": "Label pad token id. This argument is required if you want to use the default data collator." + }, + ) + padding_value: Optional[int] = field( + default=None, + metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."}, + ) + truncation_mode: str = field( + default="keep_end", + metadata={ + "help": "Truncation mode to use when the prompt is too long. Possible values are " + "`keep_end` or `keep_start`. This argument is required if you want to use the " + "default data collator." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model and reference model."}, + ) + generate_during_eval: bool = field( + default=False, + metadata={ + "help": "If `True`, generates and logs completions from both the model and the reference model " + "to W&B during evaluation." + }, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the " + "`model` argument, you need to specify if the model returned by the callable is an " + "encoder-decoder model." + }, + ) + precompute_ref_log_probs: bool = field( + default=False, + metadata={ + "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. " + "This is useful when training without the reference model to reduce the total GPU memory " + "needed." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "model from a string." + }, + ) + ref_model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "reference model from a string." 
+ }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + prompt_sample_size: int = field( + default=1024, + metadata={"help": "Number of prompts that are fed to density ratio classifier."}, + ) + min_density_ratio: float = field( + default=0.5, + metadata={"help": "Minimum value of the density ratio. The estimated density ratio is clamped to this value."}, + ) + max_density_ratio: float = field( + default=10.0, + metadata={"help": "Maximum value of the density ratio. The estimated density ratio is clamped to this value."}, + ) diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py index a451d7c09c..65b1187466 100644 --- a/trl/trainer/cpo_config.py +++ b/trl/trainer/cpo_config.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Any, Literal, Optional +from dataclasses import dataclass, field +from typing import Any, Optional from transformers import TrainingArguments @@ -31,12 +31,12 @@ class CPOConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `1e-6`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion. This argument is required if you want to use the default data collator and your model is an encoder-decoder. beta (`float`, *optional*, defaults to `0.1`): @@ -61,37 +61,109 @@ class CPOConfig(TrainingArguments): Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`,*optional*, defaults to `"keep_end"`): Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. This argument is required if you want to use the default data collator. generate_during_eval (`bool`, *optional*, defaults to `False`): If `True`, generates and logs completions from the model to W&B or Comet during evaluation. - is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, you need to specify if the model returned by the callable is an encoder-decoder model. 
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    label_smoothing: float = 0.0
-    loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid"
-    disable_dropout: bool = True
-    cpo_alpha: float = 1.0
-    simpo_gamma: float = 0.5
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "hinge", "ipo", "simpo"],
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    cpo_alpha: float = field(
+        default=1.0,
+        metadata={"help": "Weight of the BC regularizer in CPO training."},
+    )
+    simpo_gamma: float = field(
+        default=0.5,
+        metadata={"help": "Target reward margin for the SimPO loss, used only when `loss_type='simpo'`."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={"help": "Label pad token id."},
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
+    )
+    is_encoder_decoder: Optional[bool] = field(
+        default=None,
+        metadata={"help": "Whether the model is an encoder-decoder model."},
+    )
+    model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
diff --git a/trl/trainer/ddpo_config.py b/trl/trainer/ddpo_config.py
index ca703eb806..06ba94a48e 100644
--- a/trl/trainer/ddpo_config.py
+++ b/trl/trainer/ddpo_config.py
@@ -15,7 +15,7 @@
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Literal, Optional
+from typing import Optional
 
 from transformers import is_bitsandbytes_available
 
@@ -38,7 +38,7 @@ class DDPOConfig:
            Name of this run.
        seed (`int`, *optional*, defaults to `0`):
            Random seed.
-        log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`):
+        log_with (`Literal["wandb", "tensorboard"]` or `None`, *optional*, defaults to `None`):
            Log with either 'wandb' or 'tensorboard', check
            https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
        tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
@@ -94,7 +94,7 @@ class DDPOConfig:
        train_num_inner_epochs (`int`, *optional*, defaults to `1`):
            Number of inner epochs per outer epoch.
        train_cfg (`bool`, *optional*, defaults to `True`):
-            Whether or not to use classifier-free guidance during training.
+            Whether to use classifier-free guidance during training.
        train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
            Clip advantages to the range.
        train_clip_range (`float`, *optional*, defaults to `1e-4`):
@@ -111,53 +111,179 @@ class DDPOConfig:
            Whether to compute rewards asynchronously.
        max_workers (`int`, *optional*, defaults to `2`):
            Maximum number of workers to use for async reward computation.
-        negative_prompts (`Optional[str]`, *optional*, defaults to `""`):
+        negative_prompts (`str`, *optional*, defaults to `""`):
            Comma-separated list of prompts to use as negative examples.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the final model checkpoint to the Hub.
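Dropping the `Literal[...]` annotation in favor of a plain type plus a `"choices"` metadata key (as done for `log_with` above) keeps command-line validation, because `HfArgumentParser` forwards field metadata such as `help` and `choices` to `argparse.ArgumentParser.add_argument`. A minimal sketch of that behavior, assuming standard `HfArgumentParser` semantics (`DemoLogConfig` is hypothetical):

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoLogConfig:
    log_with: Optional[str] = field(
        default=None,
        metadata={
            "help": "Log with either 'wandb' or 'tensorboard'.",
            "choices": ["wandb", "tensorboard"],
        },
    )


parser = HfArgumentParser(DemoLogConfig)
(config,) = parser.parse_args_into_dataclasses(["--log_with", "wandb"])
print(config.log_with)  # wandb
# An invalid value such as "--log_with mlflow" makes argparse exit with an
# "invalid choice" error, matching what the removed Literal type documented.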
""" - exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - run_name: str = "" - seed: int = 0 - log_with: Optional[Literal["wandb", "tensorboard"]] = None - tracker_kwargs: dict = field(default_factory=dict) - accelerator_kwargs: dict = field(default_factory=dict) - project_kwargs: dict = field(default_factory=dict) - tracker_project_name: str = "trl" - logdir: str = "logs" - num_epochs: int = 100 - save_freq: int = 1 - num_checkpoint_limit: int = 5 - mixed_precision: str = "fp16" - allow_tf32: bool = True - resume_from: str = "" - sample_num_steps: int = 50 - sample_eta: float = 1.0 - sample_guidance_scale: float = 5.0 - sample_batch_size: int = 1 - sample_num_batches_per_epoch: int = 2 - train_batch_size: int = 1 - train_use_8bit_adam: bool = False - train_learning_rate: float = 3e-4 - train_adam_beta1: float = 0.9 - train_adam_beta2: float = 0.999 - train_adam_weight_decay: float = 1e-4 - train_adam_epsilon: float = 1e-8 - train_gradient_accumulation_steps: int = 1 - train_max_grad_norm: float = 1.0 - train_num_inner_epochs: int = 1 - train_cfg: bool = True - train_adv_clip_max: float = 5.0 - train_clip_range: float = 1e-4 - train_timestep_fraction: float = 1.0 - per_prompt_stat_tracking: bool = False - per_prompt_stat_tracking_buffer_size: int = 16 - per_prompt_stat_tracking_min_count: int = 16 - async_reward_computation: bool = False - max_workers: int = 2 - negative_prompts: str = "" - push_to_hub: bool = False + exp_name: str = field( + default=os.path.basename(sys.argv[0])[: -len(".py")], + metadata={"help": "Name of this experiment (by default is the file name without the extension name)."}, + ) + run_name: str = field( + default="", + metadata={"help": "Name of this run."}, + ) + seed: int = field( + default=0, + metadata={"help": "Random seed."}, + ) + log_with: Optional[str] = field( + default=None, + metadata={ + "help": "Log with either 'wandb' or 'tensorboard'.", + "choices": ["wandb", "tensorboard"], + }, + ) + tracker_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the tracker (e.g. wandb_project)."}, + ) + accelerator_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator."}, + ) + project_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator project config (e.g. 
`logging_dir`)."}, + ) + tracker_project_name: str = field( + default="trl", + metadata={"help": "Name of project to use for tracking."}, + ) + logdir: str = field( + default="logs", + metadata={"help": "Top-level logging directory for checkpoint saving."}, + ) + num_epochs: int = field( + default=100, + metadata={"help": "Number of epochs to train."}, + ) + save_freq: int = field( + default=1, + metadata={"help": "Number of epochs between saving model checkpoints."}, + ) + num_checkpoint_limit: int = field( + default=5, + metadata={"help": "Number of checkpoints to keep before overwriting old ones."}, + ) + mixed_precision: str = field( + default="fp16", + metadata={"help": "Mixed precision training."}, + ) + allow_tf32: bool = field( + default=True, + metadata={"help": "Allow `tf32` on Ampere GPUs."}, + ) + resume_from: str = field( + default="", + metadata={"help": "Resume training from a checkpoint."}, + ) + sample_num_steps: int = field( + default=50, + metadata={"help": "Number of sampler inference steps."}, + ) + sample_eta: float = field( + default=1.0, + metadata={"help": "Eta parameter for the DDIM sampler."}, + ) + sample_guidance_scale: float = field( + default=5.0, + metadata={"help": "Classifier-free guidance weight."}, + ) + sample_batch_size: int = field( + default=1, + metadata={"help": "Batch size (per GPU) to use for sampling."}, + ) + sample_num_batches_per_epoch: int = field( + default=2, + metadata={"help": "Number of batches to sample per epoch."}, + ) + train_batch_size: int = field( + default=1, + metadata={"help": "Batch size (per GPU) to use for training."}, + ) + train_use_8bit_adam: bool = field( + default=False, + metadata={"help": "Use 8bit Adam optimizer from bitsandbytes."}, + ) + train_learning_rate: float = field( + default=3e-4, + metadata={"help": "Learning rate."}, + ) + train_adam_beta1: float = field( + default=0.9, + metadata={"help": "Adam beta1."}, + ) + train_adam_beta2: float = field( + default=0.999, + metadata={"help": "Adam beta2."}, + ) + train_adam_weight_decay: float = field( + default=1e-4, + metadata={"help": "Adam weight decay."}, + ) + train_adam_epsilon: float = field( + default=1e-8, + metadata={"help": "Adam epsilon."}, + ) + train_gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of gradient accumulation steps."}, + ) + train_max_grad_norm: float = field( + default=1.0, + metadata={"help": "Maximum gradient norm for gradient clipping."}, + ) + train_num_inner_epochs: int = field( + default=1, + metadata={"help": "Number of inner epochs per outer epoch."}, + ) + train_cfg: bool = field( + default=True, + metadata={"help": "Whether to use classifier-free guidance during training."}, + ) + train_adv_clip_max: float = field( + default=5.0, + metadata={"help": "Clip advantages to the range."}, + ) + train_clip_range: float = field( + default=1e-4, + metadata={"help": "PPO clip range."}, + ) + train_timestep_fraction: float = field( + default=1.0, + metadata={"help": "Fraction of timesteps to train on."}, + ) + per_prompt_stat_tracking: bool = field( + default=False, + metadata={"help": "Whether to track statistics for each prompt separately."}, + ) + per_prompt_stat_tracking_buffer_size: int = field( + default=16, + metadata={"help": "Number of reward values to store in the buffer for each prompt."}, + ) + per_prompt_stat_tracking_min_count: int = field( + default=16, + metadata={"help": "Minimum number of reward values to store in the buffer."}, + ) + async_reward_computation: bool = field( + 
default=False, + metadata={"help": "Whether to compute rewards asynchronously."}, + ) + max_workers: int = field( + default=2, + metadata={"help": "Maximum number of workers to use for async reward computation."}, + ) + negative_prompts: str = field( + default="", + metadata={"help": "Comma-separated list of prompts to use as negative examples."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the final model checkpoint to the Hub."}, + ) def to_dict(self): output_dict = {} diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 966ea7079c..9c98f9e910 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum -from typing import Any, Literal, Optional +from typing import Any, Optional from transformers import TrainingArguments @@ -67,20 +67,20 @@ class DPOConfig(TrainingArguments): - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. use_weighting (`bool`, *optional*, defaults to `False`): - Whether or not to weight the loss as done in the [WPO](https://huggingface.co/papers/2406.11827) paper. + Whether to weight the loss as done in the [WPO](https://huggingface.co/papers/2406.11827) paper. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): Truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. is_encoder_decoder(`Optional[int]`, *optional*, defaults to `None`): @@ -94,21 +94,21 @@ class DPOConfig(TrainingArguments): precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): Whether to precompute reference model log probabilities for training and evaluation datasets. This is useful when training without the reference model to reduce the total GPU memory needed. - precompute_ref_batch_size (`Optional[int]`, *optional*, defaults to `None`): + precompute_ref_batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size to use when precomputing reference model log probabilities. 
This can be set higher than the training batch size to speed up preprocessing. If `None`, defaults to `per_device_train_batch_size` for training and `per_device_eval_batch_size` for evaluation. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model from a string. - model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. - ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. reference_free (`bool`, *optional*, defaults to `False`): If `True`, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal @@ -141,54 +141,217 @@ class DPOConfig(TrainingArguments): τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`. use_num_logits_to_keep (`bool`, *optional*, defaults to `False`): - If `True`, only a specified number of logits are computed in the forward pass of CausalLM. This can be useful - for saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios - when working with very long prompts where labels are -ignored (-100). + If `True`, only a specified number of logits are computed in the forward pass of CausalLM. This can be + useful for saving memory and speeding up training by not computing the logits for all tokens, especially in + scenarios when working with very long prompts where labels are ignored (-100). 
            [Read more](https://huggingface.co/docs/transformers/main/model_doc/llama#transformers.LlamaForCausalLM)
    """

-    learning_rate: float = 1e-6
-    beta: float = 0.1
-    label_smoothing: float = 0.0
-    loss_type: Literal[
-        "sigmoid",
-        "hinge",
-        "ipo",
-        "exo_pair",
-        "nca_pair",
-        "robust",
-        "bco_pair",
-        "sppo_hard",
-        "aot",
-        "aot_pair",
-        "discopop",
-        "apo_zero",
-        "apo_down",
-    ] = "sigmoid"
-    use_weighting: bool = False
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    is_encoder_decoder: Optional[bool] = None
-    disable_dropout: bool = True
-    generate_during_eval: bool = False
-    precompute_ref_log_probs: bool = False
-    precompute_ref_batch_size: Optional[int] = None
-    dataset_num_proc: Optional[int] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[dict[str, Any]] = None
-    model_adapter_name: Optional[str] = None
-    ref_adapter_name: Optional[str] = None
-    reference_free: bool = False
-    force_use_ref_model: bool = False
-    f_divergence_type: FDivergenceType = FDivergenceType.REVERSE_KL
-    f_alpha_divergence_coef: float = 1.0
-    sync_ref_model: bool = False
-    ref_model_mixup_alpha: float = 0.9
-    ref_model_sync_steps: int = 64
-    rpo_alpha: Optional[float] = None
-    discopop_tau: float = 0.05
-    use_num_logits_to_keep: bool = False
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. "
+            "Higher β means less deviation from the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": [
+                "sigmoid",
+                "hinge",
+                "ipo",
+                "exo_pair",
+                "nca_pair",
+                "robust",
+                "bco_pair",
+                "sppo_hard",
+                "aot",
+                "aot_pair",
+                "discopop",
+                "apo_zero",
+                "apo_down",
+            ],
+        },
+    )
+    use_weighting: bool = field(
+        default=False,
+        metadata={"help": "Whether to weight the loss as done in the WPO paper."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long. This argument is required if you want to use "
+            "the default data collator.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. 
This argument is required if you want to use the default data " + "collator and your model is an encoder-decoder." + }, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the " + "`model` argument, you need to specify if the model returned by the callable is an encoder-decoder model." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model and reference model."}, + ) + generate_during_eval: bool = field( + default=False, + metadata={ + "help": "If `True`, generates and logs completions from both the model and the reference model " + "to W&B during evaluation." + }, + ) + precompute_ref_log_probs: bool = field( + default=False, + metadata={ + "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. " + "This is useful when training without the reference model to reduce the total GPU memory needed." + }, + ) + precompute_ref_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": "Batch size to use when precomputing reference model log probabilities. This can be set higher " + "than the training batch size to speed up preprocessing. If `None`, defaults to " + "`per_device_train_batch_size` for training and `per_device_eval_batch_size` for evaluation." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "model from a string." + }, + ) + ref_model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "reference model from a string." + }, + ) + model_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."}, + ) + ref_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."}, + ) + reference_free: bool = field( + default=False, + metadata={ + "help": "If `True`, we ignore the _provided_ reference model and implicitly use a reference model that " + "assigns equal probability to all responses." + }, + ) + force_use_ref_model: bool = field( + default=False, + metadata={ + "help": "In case one passes a PEFT model for the active model and you want to use a different model for " + "the ref_model, set this flag to `True`." + }, + ) + f_divergence_type: FDivergenceType = field( + default=FDivergenceType.REVERSE_KL, + metadata={ + "help": "Type of f-divergence regularization function to compute divergence between policy and reference " + "model." + }, + ) + f_alpha_divergence_coef: float = field( + default=1.0, + metadata={"help": "α coefficient in the α-divergence u^-α regularization function for DPO loss."}, + ) + sync_ref_model: bool = field( + default=False, + metadata={ + "help": "When set to `True`, the reference model is synchronized with the active model every " + "`ref_model_sync_steps` steps, using the `ref_model_mixup_alpha` parameter." 
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.9,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`"
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=64,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy."
+        },
+    )
+    rpo_alpha: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "α parameter from the RPO paper (v3), which controls the weighting of the NLL term in the loss. "
+            "If `None`, no weighting is applied and the loss is the same as the DPO loss. The paper recommends "
+            "`rpo_alpha=1.0`."
+        },
+    )
+    discopop_tau: float = field(
+        default=0.05,
+        metadata={
+            "help": "τ/temperature parameter from the DiscoPOP paper, which controls the shape of log ratio modulated "
+            "loss. The paper recommends the default value `discopop_tau=0.05`."
+        },
+    )
+    use_num_logits_to_keep: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, only a specified number of logits are computed in the forward pass of CausalLM. "
+            "This can be useful for saving memory and speeding up training by not computing the logits for all "
+            "tokens, especially in scenarios when working with very long prompts where labels are ignored (-100)."
+        },
+    )
diff --git a/trl/trainer/gkd_config.py b/trl/trainer/gkd_config.py
index e110b047d1..d55bec4b06 100644
--- a/trl/trainer/gkd_config.py
+++ b/trl/trainer/gkd_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Optional
 
 from .sft_config import SFTConfig
@@ -21,7 +21,7 @@
 @dataclass
 class GKDConfig(SFTConfig):
    """
-    Configuration class for GKDTrainer.
+    Configuration class for [`GKDTrainer`].
 
    Args:
        temperature (`float`, *optional*, defaults to `0.9`):
@@ -34,10 +34,10 @@ class GKDConfig(SFTConfig):
            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
        max_new_tokens (`int`, *optional*, defaults to `128`):
            Maximum number of tokens to generate per completion.
-        teacher_model_name_or_path (`Optional[str]`, *optional*, defaults to `None`):
+        teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model
            being trained.
-        teacher_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        teacher_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
            from a string.
        disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -47,14 +47,54 @@ class GKDConfig(SFTConfig):
            on teacher-generated output).
    """

-    temperature: float = 0.9
-    lmbda: float = 0.5
-    beta: float = 0.5
-    max_new_tokens: int = 128
-    teacher_model_name_or_path: Optional[str] = None
-    teacher_model_init_kwargs: Optional[dict[str, Any]] = None
-    disable_dropout: bool = True
-    seq_kd: bool = False
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    lmbda: float = field(
+        default=0.5,
+        metadata={
+            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
+            "student-generated outputs)."
+        },
+    )
+    beta: float = field(
+        default=0.5,
+        metadata={
+            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
+            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
+            "Divergence."
+        },
+    )
+    max_new_tokens: int = field(
+        default=128,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    teacher_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
+            "model being trained."
+        },
+    )
+    teacher_model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in `model`."},
+    )
+    seq_kd: bool = field(
+        default=False,
+        metadata={
+            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
+            "FT on teacher-generated output)."
+        },
+    )
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py
index c29491340d..6b28b51367 100644
--- a/trl/trainer/judges.py
+++ b/trl/trainer/judges.py
@@ -161,7 +161,7 @@ def judge(
    This base class should be used to implement binary evaluations as done in section 4.1.4 of the
    [CGPO paper](https://huggingface.co/papers/2409.20370).
-    It is relevant for assessing whether or not a prompt completion pair satisfies a specific contraint.
+    It is relevant for assessing whether a prompt completion pair satisfies a specific constraint.
 
    Args:
        prompts (`list[str]`): List of prompts.
diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py
index 82926a6aec..5f54ef51d7 100644
--- a/trl/trainer/kto_config.py
+++ b/trl/trainer/kto_config.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
-from typing import Any, Literal, Optional
+from dataclasses import dataclass, field
+from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -31,12 +31,12 @@ class KTOConfig(TrainingArguments):
        learning_rate (`float`, *optional*, defaults to `5e-7`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
            to use the default data collator.
-        max_prompt_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_prompt_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_completion_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the completion. This argument is required if you want to use the default data
            collator and your model is an encoder-decoder.
        beta (`float`, *optional*, defaults to `0.1`):
@@ -54,7 +54,7 @@ class KTOConfig(TrainingArguments):
            Undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs.
        label_pad_token_id (`int`, *optional*, defaults to `-100`):
            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`Optional[int]`, *optional*, defaults to `None`):
+        padding_value (`int` or `None`, *optional*, defaults to `None`):
            Padding value to use. If `None`, the padding value of the tokenizer is used.
        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
@@ -62,39 +62,134 @@ class KTOConfig(TrainingArguments):
        generate_during_eval (`bool`, *optional*, defaults to `False`):
            If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
            during evaluation.
-        is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+        is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
            you need to specify if the model returned by the callable is an encoder-decoder model.
        precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
            Whether to precompute reference model log probabilities for training and evaluation datasets. This is
            useful when training without the reference model to reduce the total GPU memory needed.
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
            from a string.
-        dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model and reference model.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    loss_type: Literal["kto", "apo_zero_unpaired"] = "kto"
-    desirable_weight: float = 1.0
-    undesirable_weight: float = 1.0
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    disable_dropout: bool = True
-    precompute_ref_log_probs: bool = False
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    loss_type: str = field(
+        default="kto",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["kto", "apo_zero_unpaired"],
+        },
+    )
+    desirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Desirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    undesirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Undesirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from both the model and the reference model to W&B "
+            "during evaluation."
+        },
+    )
+    is_encoder_decoder: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
+            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model and reference model."},
+    )
+    precompute_ref_log_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. "
+            "This is useful when training without the reference model to reduce the total GPU memory needed."
+        },
+    )
+    model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    ref_model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "reference model from a string."
+ }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py index 3261ff8a6a..ec9119f36b 100644 --- a/trl/trainer/model_config.py +++ b/trl/trainer/model_config.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Literal, Optional +from dataclasses import dataclass, field +from typing import Optional @dataclass @@ -26,11 +26,11 @@ class ModelConfig: command line. Parameters: - model_name_or_path (`Optional[str]`, *optional*, defaults to `None`): + model_name_or_path (`str` or `None`, *optional*, defaults to `None`): Model checkpoint for weights initialization. model_revision (`str`, *optional*, defaults to `"main"`): Specific model version to use. It can be a branch name, a tag name, or a commit id. - torch_dtype (`Optional[Literal["auto", "bfloat16", "float16", "float32"]]`, *optional*, defaults to `None`): + torch_dtype (`Literal["auto", "bfloat16", "float16", "float32"]` or `None`, *optional*, defaults to `None`): Override the default `torch.dtype` and load the model under this dtype. Possible values are - `"bfloat16"`: `torch.bfloat16` @@ -42,7 +42,7 @@ class ModelConfig: Whether to allow for custom models defined on the Hub in their own modeling files. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - attn_implementation (`Optional[str]`, *optional*, defaults to `None`): + attn_implementation (`str` or `None`, *optional*, defaults to `None`): Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`. use_peft (`bool`, *optional*, defaults to `False`): @@ -53,9 +53,9 @@ class ModelConfig: LoRA alpha. lora_dropout (`float`, *optional*, defaults to `0.05`): LoRA dropout. - lora_target_modules (`Optional[Union[str, list[str]]]`, *optional*, defaults to `None`): + lora_target_modules (`Union[str, list[str]]` or `None`, *optional*, defaults to `None`): LoRA target modules. - lora_modules_to_save (`Optional[list[str]]`, *optional*, defaults to `None`): + lora_modules_to_save (`list[str]` or `None`, *optional*, defaults to `None`): Model layers to unfreeze & train. lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`): Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling). @@ -72,27 +72,91 @@ class ModelConfig: Whether to use nested quantization. 
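For context, the four quantization flags above are typically folded into a single `transformers.BitsAndBytesConfig`. The sketch below is a simplified illustration in the spirit of `trl.trainer.utils.get_quantization_config` (the real helper also wires `torch_dtype` into the 4-bit compute dtype); treat it as an assumption-laden example, not the canonical implementation:

from typing import Optional

from transformers import BitsAndBytesConfig


def make_quantization_config(
    load_in_4bit: bool,
    load_in_8bit: bool,
    bnb_4bit_quant_type: str,
    use_bnb_nested_quant: bool,
) -> Optional[BitsAndBytesConfig]:
    # Hypothetical helper: maps ModelConfig-style flags onto bitsandbytes options.
    if load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=bnb_4bit_quant_type,  # "fp4" or "nf4"
            bnb_4bit_use_double_quant=use_bnb_nested_quant,  # nested quantization
        )
    if load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)
    return None  # no quantization: load the base model in full precision


print(make_quantization_config(True, False, "nf4", True))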
""" - model_name_or_path: Optional[str] = None - model_revision: str = "main" - torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None - trust_remote_code: bool = False - attn_implementation: Optional[str] = None - use_peft: bool = False - lora_r: int = 16 - lora_alpha: int = 32 - lora_dropout: float = 0.05 - lora_target_modules: Optional[list[str]] = None - lora_modules_to_save: Optional[list[str]] = None - lora_task_type: str = "CAUSAL_LM" - use_rslora: bool = False - load_in_8bit: bool = False - load_in_4bit: bool = False - bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4" - use_bnb_nested_quant: bool = False + model_name_or_path: Optional[str] = field( + default=None, + metadata={"help": "Model checkpoint for weights initialization."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "Specific model version to use. It can be a branch name, a tag name, or a commit id."}, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": "Override the default `torch.dtype` and load the model under this dtype.", + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": "Whether to allow for custom models defined on the Hub in their own modeling files. This option " + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + }, + ) + attn_implementation: Optional[str] = field( + default=None, + metadata={ + "help": "Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in " + "which case you must install this manually by running `pip install flash-attn --no-build-isolation`." + }, + ) + use_peft: bool = field( + default=False, + metadata={"help": "Whether to use PEFT for training."}, + ) + lora_r: int = field( + default=16, + metadata={"help": "LoRA R value."}, + ) + lora_alpha: int = field( + default=32, + metadata={"help": "LoRA alpha."}, + ) + lora_dropout: float = field( + default=0.05, + metadata={"help": "LoRA dropout."}, + ) + lora_target_modules: Optional[list[str]] = field( + default=None, + metadata={"help": "LoRA target modules."}, + ) + lora_modules_to_save: Optional[list[str]] = field( + default=None, + metadata={"help": "Model layers to unfreeze & train."}, + ) + lora_task_type: str = field( + default="CAUSAL_LM", + metadata={"help": "Task type to pass for LoRA (use 'SEQ_CLS' for reward modeling)."}, + ) + use_rslora: bool = field( + default=False, + metadata={ + "help": "Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, " + "instead of the original default value of `lora_alpha/r`." + }, + ) + load_in_8bit: bool = field( + default=False, + metadata={"help": "Whether to use 8 bit precision for the base model. Works only with LoRA."}, + ) + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to use 4 bit precision for the base model. 
Works only with LoRA."}, + ) + bnb_4bit_quant_type: str = field( + default="nf4", + metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]}, + ) + use_bnb_nested_quant: bool = field( + default=False, + metadata={"help": "Whether to use nested quantization."}, + ) def __post_init__(self): if self.load_in_8bit and self.load_in_4bit: raise ValueError("You can't use 8 bit and 4 bit precision at the same time") - if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1: + if hasattr(self.lora_target_modules, "__len__") and len(self.lora_target_modules) == 1: self.lora_target_modules = self.lora_target_modules[0] diff --git a/trl/trainer/nash_md_config.py b/trl/trainer/nash_md_config.py index dadad01f03..c8395fd136 100644 --- a/trl/trainer/nash_md_config.py +++ b/trl/trainer/nash_md_config.py @@ -31,7 +31,14 @@ class NashMDConfig(OnlineDPOConfig): epochs. """ - mixture_coef: list[float] = field(default_factory=lambda: [0.5]) + mixture_coef: list[float] = field( + default_factory=lambda: [0.5], + metadata={ + "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided " + "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the " + "rest of the epochs." + }, + ) def __post_init__(self): super().__post_init__() diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index 5e75ede883..08a5da542b 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -13,7 +13,7 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Optional from transformers import TrainingArguments @@ -31,15 +31,15 @@ class OnlineDPOConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `5e-7`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - reward_model_path (`Optional[str]`, *optional*, defaults to `None`): + reward_model_path (`str` or `None`, *optional*, defaults to `None`): Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both. - judge (`Optional[str]`, *optional*, defaults to `None`): + judge (`str` or `None`, *optional*, defaults to `None`): Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both. max_new_tokens (`int`, *optional*, defaults to `64`): Maximum number of tokens to generate per completion. temperature (`float`, *optional*, defaults to `0.9`): Temperature for sampling. The higher the temperature, the more random the completions. - missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): + missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`): Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive value. @@ -54,22 +54,71 @@ class OnlineDPOConfig(TrainingArguments): - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. 
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model and reference model.
    """

-    learning_rate: float = 5e-7
-    reward_model_path: Optional[str] = None
-    judge: Optional[str] = None
-    max_new_tokens: int = 64
-    temperature: float = 0.9
-    missing_eos_penalty: Optional[float] = None
-    beta: list[float] = field(default_factory=lambda: [0.1])
-    loss_type: Literal["sigmoid", "ipo"] = "sigmoid"
-    dataset_num_proc: Optional[int] = None
-    disable_dropout: bool = True
+    learning_rate: float = field(
+        default=5e-7,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    reward_model_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    judge: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    max_new_tokens: int = field(
+        default=64,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    missing_eos_penalty: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to "
+            "encourage the model to generate completions shorter than the maximum length (`max_new_tokens`). The "
+            "penalty must be a positive value."
+        },
+    )
+    beta: list[float] = field(
+        default_factory=lambda: [0.1],
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model. For the IPO loss (`loss_type='ipo'`), β is the regularization parameter denoted by "
+            "τ in the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided, β is "
+            "selected for each new epoch and the last β is used for the rest of the epochs."
+        },
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "ipo"],
+        },
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model and reference model."},
+    )
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py
index cd892f1d46..f9a55ff47a 100644
--- a/trl/trainer/orpo_config.py
+++ b/trl/trainer/orpo_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -31,12 +31,12 @@ class ORPOConfig(TrainingArguments):
        learning_rate (`float`, *optional*, defaults to `1e-6`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you
            want to use the default data collator.
-        max_prompt_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_prompt_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_completion_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the completion. This argument is required if you want to use the default data collator
            and your model is an encoder-decoder.
        beta (`float`, *optional*, defaults to `0.1`):
@@ -46,33 +46,95 @@ class ORPOConfig(TrainingArguments):
            Whether to disable dropout in the model.
        label_pad_token_id (`int`, *optional*, defaults to `-100`):
            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`Optional[int]`, *optional*, defaults to `None`):
+        padding_value (`int` or `None`, *optional*, defaults to `None`):
            Padding value to use. If `None`, the padding value of the tokenizer is used.
        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
            This argument is required if you want to use the default data collator.
        generate_during_eval (`bool`, *optional*, defaults to `False`):
            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
-        is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+        is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
            you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    disable_dropout: bool = True
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. 
This argument is required if you want to use the default data " + "collator and your model is an encoder-decoder." + }, + ) + beta: float = field( + default=0.1, + metadata={ + "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is " + "denoted by λ." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model."}, + ) + label_pad_token_id: int = field( + default=-100, + metadata={ + "help": "Label pad token id. This argument is required if you want to use the default data collator." + }, + ) + padding_value: Optional[int] = field( + default=None, + metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."}, + ) + truncation_mode: str = field( + default="keep_end", + metadata={ + "help": "Truncation mode to use when the prompt is too long.", + "choices": ["keep_end", "keep_start"], + }, + ) + generate_during_eval: bool = field( + default=False, + metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."}, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` " + "argument, you need to specify if the model returned by the callable is an encoder-decoder model." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model " + "from a string." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py index 62a3b0a33e..c98f8aedce 100644 --- a/trl/trainer/ppo_config.py +++ b/trl/trainer/ppo_config.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from ..trainer.utils import OnPolicyConfig @@ -33,9 +33,9 @@ class PPOConfig(OnPolicyConfig): Name of this experiment. reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): Path to the reward model. - model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. - ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. num_ppo_epochs (`int`, *optional*, defaults to `4`): Number of epochs to train. @@ -55,15 +55,51 @@ class PPOConfig(OnPolicyConfig): Lambda value for GAE. 
""" - exp_name: str = os.path.basename(__file__)[: -len(".py")] - reward_model_path: str = "EleutherAI/pythia-160m" - model_adapter_name: Optional[str] = None - ref_adapter_name: Optional[str] = None - num_ppo_epochs: int = 4 - whiten_rewards: bool = False - kl_coef: float = 0.05 - cliprange: float = 0.2 - vf_coef: float = 0.1 - cliprange_value: float = 0.2 - gamma: float = 1.0 - lam: float = 0.95 + exp_name: str = field( + default=os.path.basename(__file__)[:-3], + metadata={"help": "Name of this experiment."}, + ) + reward_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the reward model."}, + ) + model_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."}, + ) + ref_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."}, + ) + num_ppo_epochs: int = field( + default=4, + metadata={"help": "Number of epochs to train."}, + ) + whiten_rewards: bool = field( + default=False, + metadata={"help": "Whether to whiten the rewards."}, + ) + kl_coef: float = field( + default=0.05, + metadata={"help": "KL coefficient."}, + ) + cliprange: float = field( + default=0.2, + metadata={"help": "Clip range."}, + ) + vf_coef: float = field( + default=0.1, + metadata={"help": "Value function coefficient."}, + ) + cliprange_value: float = field( + default=0.2, + metadata={"help": "Clip range for the value function."}, + ) + gamma: float = field( + default=1.0, + metadata={"help": "Discount factor."}, + ) + lam: float = field( + default=0.95, + metadata={"help": "Lambda value for GAE."}, + ) diff --git a/trl/trainer/prm_config.py b/trl/trainer/prm_config.py index 21a4fc5662..4a1046e2de 100644 --- a/trl/trainer/prm_config.py +++ b/trl/trainer/prm_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from transformers import TrainingArguments @@ -31,9 +31,9 @@ class PRMConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `1e-5`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) used for truncation. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion used for truncation. The completion is the concatenation of the steps. disable_dropout (`bool`, *optional*, defaults to `True`): Whether to disable dropout in the model. @@ -45,10 +45,37 @@ class PRMConfig(TrainingArguments): Number of processes to use for processing the dataset. """ - learning_rate: float = 1e-5 - max_length: Optional[int] = None - max_completion_length: Optional[int] = None - disable_dropout: bool = True - step_separator: str = "\n" - train_on_last_step_only: bool = False - dataset_num_proc: Optional[int] = None + learning_rate: float = field( + default=1e-5, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`TrainingArguments`." 
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) used for truncation."},
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion used for truncation. The completion is the concatenation of the "
+            "steps."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    step_separator: str = field(
+        default="\n",
+        metadata={"help": "Separator used to separate each step of the reasoning process."},
+    )
+    train_on_last_step_only: bool = field(
+        default=False,
+        metadata={"help": "Whether to train only on the last step."},
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
diff --git a/trl/trainer/reward_config.py b/trl/trainer/reward_config.py
index 8018a2844c..e19cd1ca7c 100644
--- a/trl/trainer/reward_config.py
+++ b/trl/trainer/reward_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional
 
 from transformers import TrainingArguments
@@ -28,7 +28,7 @@ class RewardConfig(TrainingArguments):
     command line.
 
     Parameters:
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
             Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
             to use the default data collator.
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -39,12 +39,36 @@
             Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
             https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.
         remove_unused_columns (`bool`, *optional*, defaults to `False`):
-            Whether or not to remove the columns that are not used by the model's forward pass. Can be `True` only if
+            Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if
             the dataset is pretokenized.
     """
 
-    max_length: Optional[int] = None
-    disable_dropout: bool = True
-    dataset_num_proc: Optional[int] = None
-    center_rewards_coefficient: Optional[float] = None
-    remove_unused_columns: bool = False
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch. This argument is required if "
+            "you want to use the default data collator."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    center_rewards_coefficient: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by "
+            "https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
+        },
+    )
+    remove_unused_columns: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to remove the columns that are not used by the model's forward pass. Can be `True` only "
+            "if the dataset is pretokenized."
+ }, + ) diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py index e72b0fc02c..b0aa66bf5e 100644 --- a/trl/trainer/rloo_config.py +++ b/trl/trainer/rloo_config.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from dataclasses import dataclass +from dataclasses import dataclass, field from ..trainer.utils import OnPolicyConfig @@ -44,10 +44,31 @@ class RLOOConfig(OnPolicyConfig): REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. """ - exp_name: str = os.path.basename(__file__)[: -len(".py")] - reward_model_path: str = "EleutherAI/pythia-160m" - num_ppo_epochs: int = 4 - whiten_rewards: bool = False - kl_coef: float = 0.05 - cliprange: float = 0.2 - rloo_k: int = 2 + exp_name: str = field( + default=os.path.basename(__file__)[:-3], + metadata={"help": "Name of this experiment."}, + ) + reward_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the reward model."}, + ) + num_ppo_epochs: int = field( + default=4, + metadata={"help": "Number of epochs to train."}, + ) + whiten_rewards: bool = field( + default=False, + metadata={"help": "Whether to whiten the rewards."}, + ) + kl_coef: float = field( + default=0.05, + metadata={"help": "KL coefficient."}, + ) + cliprange: float = field( + default=0.2, + metadata={"help": "Clip range."}, + ) + rloo_k: int = field( + default=2, + metadata={"help": "REINFORCE Leave-One-Out (RLOO) number of online samples per prompt."}, + ) diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py index 310ede7870..dbc05c6ad5 100644 --- a/trl/trainer/sft_config.py +++ b/trl/trainer/sft_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Optional from transformers import TrainingArguments @@ -35,20 +35,20 @@ class SFTConfig(TrainingArguments): Controls whether the [`ConstantLengthDataset`] packs the sequences of the dataset. learning_rate (`float`, *optional*, defaults to `2e-5`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_seq_length (`Optional[int]`, *optional*, defaults to `None`): + max_seq_length (`int` or `None`, *optional*, defaults to `None`): Maximum sequence length for the [`ConstantLengthDataset`] and for automatically creating the dataset. If `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. Only used when `packing=False`. dataset_batch_size (`Union[int, None]`, *optional*, defaults to `1000`): Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is None`, tokenizes the full dataset as a single batch. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - dataset_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets. 
- eval_packing (`Optional[bool]`, *optional*, defaults to `None`): + eval_packing (`bool` or `None`, *optional*, defaults to `None`): Whether to pack the eval dataset. If `None`, uses the same value as `packing`. num_of_sequences (`int`, *optional*, defaults to `1024`): Number of sequences to use for the [`ConstantLengthDataset`]. @@ -59,15 +59,67 @@ class SFTConfig(TrainingArguments): Monkey patch the model with Liger kernels to increase throughput and reduce memory usage. """ - dataset_text_field: str = "text" - packing: bool = False - learning_rate: float = 2.0e-5 - max_seq_length: Optional[int] = None - dataset_num_proc: Optional[int] = None - dataset_batch_size: int = 1000 - model_init_kwargs: Optional[dict[str, Any]] = None - dataset_kwargs: Optional[dict[str, Any]] = None - eval_packing: Optional[bool] = None - num_of_sequences: int = 1024 - chars_per_token: float = 3.6 - use_liger: bool = False + dataset_text_field: str = field( + default="text", + metadata={ + "help": "Name of the text field of the dataset. If provided, the trainer will automatically create a " + "`ConstantLengthDataset` based on `dataset_text_field`." + }, + ) + packing: bool = field( + default=False, + metadata={"help": "Controls whether the `ConstantLengthDataset` packs the sequences of the dataset."}, + ) + learning_rate: float = field( + default=2.0e-5, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`TrainingArguments`." + }, + ) + max_seq_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum sequence length for the `ConstantLengthDataset` and for automatically creating the " + "dataset. If `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset. Only used when `packing=False`."}, + ) + dataset_batch_size: int = field( + default=1000, + metadata={ + "help": "Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is " + "None`, tokenizes the full dataset as a single batch." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model " + "from a string." + }, + ) + dataset_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets." + }, + ) + eval_packing: Optional[bool] = field( + default=None, + metadata={"help": "Whether to pack the eval dataset. 
If `None`, uses the same value as `packing`."},
+    )
+    num_of_sequences: int = field(
+        default=1024,
+        metadata={"help": "Number of sequences to use for the `ConstantLengthDataset`."},
+    )
+    chars_per_token: float = field(
+        default=3.6, metadata={"help": "Number of characters per token to use for the `ConstantLengthDataset`."}
+    )
+    use_liger: bool = field(
+        default=False,
+        metadata={"help": "Monkey patch the model with Liger kernels to increase throughput and reduce memory usage."},
+    )
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index 5577c57481..f4d8510b9b 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -18,7 +18,7 @@
 import random
 import warnings
 from collections import deque
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from importlib.metadata import version
 from typing import Any, Literal, Optional, Union
 
@@ -73,7 +73,7 @@ class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
         differently if it does not have proper context.
     instruction_template (`Union[str, list[int]]`): the template form that indicates the start of the human instruction, typically something like
         '### Human:\n'. Useful for assistant-style conversation datasets. It can also be passed as tokenized ids.
-    mlm (`bool`, *optional*, defaults to `False`): Whether or not to use masked language modeling in the underlying
+    mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language modeling in the underlying
         `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present for
         flexibility and backwards-compatibility.
     ignore_index (`int`, *optional*, defaults to `-100`):
@@ -336,7 +336,7 @@ class RewardDataCollatorWithPadding:
             The tokenizer used for encoding the data.
         padding (`Union[bool, str, `PaddingStrategy`]`, `optional`, defaults to `True`):
             padding_strategy to pass to the tokenizer.
-        pad_to_multiple_of (`Optional[int]`, `optional`, defaults to `None`):
+        pad_to_multiple_of (`int` or `None`, `optional`, defaults to `None`):
             If set will pad the sequence to a multiple of the provided value.
         return_tensors (`str`, `optional`, defaults to `"pt"`):
             The tensor type to use.
@@ -463,8 +463,8 @@ class DPODataCollatorWithPadding:
             The tokenizer's pad_token_id.
         label_pad_token_id (`int`, defaults to -100):
             The label used for masking.
-        is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`):
-            Whether or not you model has an encoder_decoder architecture.
+        is_encoder_decoder (`bool` or `None`, `optional`, defaults to `None`):
+            Whether your model has an encoder_decoder architecture.
     """
 
     pad_token_id: int = 0
@@ -548,7 +548,7 @@ class ConstantLengthDataset(IterableDataset):
             The processor used for processing the data.
         dataset (`dataset.Dataset`):
             Dataset with text files.
-        dataset_text_field (`Optional[str]`, *optional*, defaults to `None`):
+        dataset_text_field (`str` or `None`, *optional*, defaults to `None`):
             Name of the field in the dataset that contains the text. Only one of `dataset_text_field` and
             `formatting_func` should be provided.
         formatting_func (`Callable`, *optional*):
@@ -978,13 +978,13 @@ class OnPolicyConfig(TrainingArguments):
     command line.
 
     Parameters:
-        run_name (`Optional[str]`, *optional*, defaults to `None`):
+        run_name (`str` or `None`, *optional*, defaults to `None`):
             Name of the run.
- dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. num_mini_batches (`int`, *optional*, defaults to `1`): Number of minibatches to split a batch into. - total_episodes (`Optional[int]`, *optional*, defaults to `None`): + total_episodes (`int` or `None`, *optional*, defaults to `None`): Total number of episodes in the dataset. local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`): Per rank no grad forward pass in the rollout phase. @@ -992,56 +992,125 @@ class OnPolicyConfig(TrainingArguments): Number of debugging samples generations (i.e., `generate_completions` calls) throughout training. response_length (`int`, *optional*, defaults to `53`): Length of the response. - stop_token (`Optional[str]`, *optional*, defaults to `None`): + stop_token (`str` or `None`, *optional*, defaults to `None`): Stop token. - stop_token_id (`Optional[int]`, *optional*, defaults to `None`): + stop_token_id (`int` or `None`, *optional*, defaults to `None`): Truncation token id. temperature (`float`, *optional*, defaults to `0.7`): Sampling temperature. - missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): + missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`): Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive value. sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): Path to the SFT model. - world_size (`Optional[int]`, *optional*, defaults to `None`): + world_size (`int` or `None`, *optional*, defaults to `None`): Number of processes (GPUs) to use for the training. - num_total_batches (`Optional[int]`, *optional*, defaults to `None`): + num_total_batches (`int` or `None`, *optional*, defaults to `None`): Number of total batches to train. - micro_batch_size (`Optional[int]`, *optional*, defaults to `None`): + micro_batch_size (`int` or `None`, *optional*, defaults to `None`): Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`). - local_batch_size (`Optional[int]`, *optional*, defaults to `None`): + local_batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`). - batch_size (`Optional[int]`, *optional*, defaults to `None`): + batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`). - local_mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + local_mini_batch_size (`int` or `None`, *optional*, defaults to `None`): Mini batch size per GPU. - mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + mini_batch_size (`int` or `None`, *optional*, defaults to `None`): Mini batch size across GPUs. push_to_hub (`bool`, *optional*, defaults to `False`): Whether to push the model to the Hub after training. 
""" - run_name: Optional[str] = None - dataset_num_proc: Optional[int] = None - num_mini_batches: int = 1 - total_episodes: Optional[int] = None - local_rollout_forward_batch_size: int = 64 - num_sample_generations: int = 10 - response_length: int = 53 - stop_token: Optional[Literal["eos"]] = None - stop_token_id: Optional[int] = None - temperature: float = 0.7 - missing_eos_penalty: Optional[float] = None - sft_model_path: str = "EleutherAI/pythia-160m" - world_size: Optional[int] = None - num_total_batches: Optional[int] = None - micro_batch_size: Optional[int] = None - local_batch_size: Optional[int] = None - batch_size: Optional[int] = None - local_mini_batch_size: Optional[int] = None - mini_batch_size: Optional[int] = None - push_to_hub: bool = False + run_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the run."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + num_mini_batches: int = field( + default=1, + metadata={"help": "Number of minibatches to split a batch into."}, + ) + total_episodes: Optional[int] = field( + default=None, + metadata={"help": "Total number of episodes in the dataset."}, + ) + local_rollout_forward_batch_size: int = field( + default=64, + metadata={"help": "Per rank no grad forward pass in the rollout phase."}, + ) + num_sample_generations: int = field( + default=10, + metadata={ + "help": "Number of debugging samples generations (i.e., `generate_completions` calls) throughout training." + }, + ) + response_length: int = field( + default=53, + metadata={"help": "Length of the response."}, + ) + stop_token: Optional[Literal["eos"]] = field( + default=None, + metadata={"help": "Stop token."}, + ) + stop_token_id: Optional[int] = field( + default=None, + metadata={"help": "Truncation token id."}, + ) + temperature: float = field( + default=0.7, + metadata={"help": "Sampling temperature."}, + ) + missing_eos_penalty: Optional[float] = field( + default=None, + metadata={ + "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to " + "encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be " + "a positive value." + }, + ) + sft_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the SFT model."}, + ) + world_size: Optional[int] = field( + default=None, + metadata={"help": "Number of processes (GPUs) to use for the training."}, + ) + num_total_batches: Optional[int] = field( + default=None, + metadata={"help": "Number of total batches to train."}, + ) + micro_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)."}, + ) + local_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)."}, + ) + batch_size: Optional[int] = field( + default=None, + metadata={ + "help": "Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * " + "`gradient_accumulation_steps`)." 
+ }, + ) + local_mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size per GPU."}, + ) + mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size across GPUs."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the model to the Hub after training."}, + ) def first_true_indices(bools: torch.Tensor, dtype=torch.long): diff --git a/trl/trainer/xpo_config.py b/trl/trainer/xpo_config.py index ffeacbb961..8ae925994b 100644 --- a/trl/trainer/xpo_config.py +++ b/trl/trainer/xpo_config.py @@ -26,10 +26,17 @@ class XPOConfig(OnlineDPOConfig): Parameters: alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): - Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch and the last alpha is used for the rest of the epochs. + Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch + and the last alpha is used for the rest of the epochs. """ - alpha: list[float] = field(default_factory=lambda: [1e-5]) + alpha: list[float] = field( + default_factory=lambda: [1e-5], + metadata={ + "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each " + "new epoch and the last alpha is used for the rest of the epochs." + }, + ) def __post_init__(self): super().__post_init__()
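
The sketches below illustrate the behaviors that several of the new help strings describe. None of this code is part of the patch; any name not present in the diff is illustrative only.

First, why the help text moves into `field(metadata=...)` at all: `HfArgumentParser` copies a dataclass field's metadata into the keyword arguments of `argparse`'s `add_argument`, so `"help"` becomes the `--help` description and `"choices"` constrains the accepted values. This is also why `Literal[...]` annotations above become plain types plus a `"choices"` entry. A minimal sketch, assuming only `transformers` is installed; `DemoConfig` and its fields are made up for illustration:

# demo_field_metadata.py -- illustrative, not part of the patch
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoConfig:
    loss_type: str = field(
        default="sigmoid",
        # "help" feeds --help; "choices" is forwarded to argparse, so
        # `--loss_type hinge` fails with "invalid choice".
        metadata={"help": "Type of loss to use.", "choices": ["sigmoid", "ipo"]},
    )
    dataset_num_proc: Optional[int] = field(
        default=None,
        metadata={"help": "Number of processes to use for processing the dataset."},
    )


parser = HfArgumentParser(DemoConfig)
(config,) = parser.parse_args_into_dataclasses(args=["--loss_type", "ipo"])
print(config)  # DemoConfig(loss_type='ipo', dataset_num_proc=None)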
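
The `beta` and `alpha` help strings describe a per-epoch schedule: one value per epoch, with the last value reused once the list is exhausted. A hedged sketch of that lookup (the trainers' actual selection code is not part of this diff):

# Per-epoch schedule as documented: schedule[epoch], falling back to the last entry.
def value_for_epoch(schedule: list[float], epoch: int) -> float:
    return schedule[epoch] if epoch < len(schedule) else schedule[-1]


beta = [0.1, 0.05, 0.02]
assert value_for_epoch(beta, 0) == 0.1
assert value_for_epoch(beta, 5) == 0.02  # past the end of the list -> last beta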
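
The `missing_eos_penalty` help describes subtracting a positive penalty from the score of any completion that never emits the EOS token, nudging the policy toward finishing before `max_new_tokens`. A sketch under assumed tensor shapes; the function name and signature are illustrative, not the trainers' real code:

import torch


def apply_missing_eos_penalty(
    scores: torch.Tensor,          # (batch,) reward scores
    completion_ids: torch.Tensor,  # (batch, seq_len) generated token ids
    eos_token_id: int,
    missing_eos_penalty: float,    # must be a positive value, per the help string
) -> torch.Tensor:
    # True for rows that contain at least one EOS token.
    contains_eos = (completion_ids == eos_token_id).any(dim=-1)
    scores = scores.clone()
    scores[~contains_eos] -= missing_eos_penalty
    return scores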
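
On the PPO side, `gamma` ("Discount factor.") and `lam` ("Lambda value for GAE.") are the two knobs of generalized advantage estimation. A simplified, single-trajectory sketch of the standard GAE recursion, offered as background rather than the trainer's verbatim implementation:

import torch


def gae(rewards: torch.Tensor, values: torch.Tensor, gamma: float = 1.0, lam: float = 0.95) -> torch.Tensor:
    # Backward pass: A_t = delta_t + gamma * lam * A_{t+1},
    # where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = values[t + 1] if t + 1 < len(values) else 0.0
        delta = rewards[t] + gamma * next_value - values[t]
        last_gae = delta + gamma * lam * last_gae
        advantages[t] = last_gae
    return advantages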
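
`rloo_k` refers to the REINFORCE leave-one-out baseline: with k online samples per prompt, each sample's advantage is its reward minus the mean reward of the other k - 1 samples. A sketch with assumed shapes, illustrating the idea rather than quoting the trainer:

import torch


def rloo_advantages(rewards: torch.Tensor) -> torch.Tensor:
    """rewards: (rloo_k, num_prompts) -> advantages of the same shape."""
    rloo_k = rewards.size(0)
    # Leave-one-out mean: (sum of all samples - this sample) / (k - 1).
    baseline = (rewards.sum(dim=0, keepdim=True) - rewards) / (rloo_k - 1)
    return rewards - baseline


rewards = torch.tensor([[1.0, 0.0], [3.0, 2.0]])  # rloo_k=2, two prompts
print(rloo_advantages(rewards))  # each sample baselined by the other one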
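
`center_rewards_coefficient` enables an auxiliary term that pushes the reward model toward mean-zero outputs (Eq. 2 of https://huggingface.co/papers/2312.09244). A hedged reconstruction of how such a term combines with the usual pairwise loss; not necessarily the trainer's exact code:

import torch


def centered_reward_loss(
    rewards_chosen: torch.Tensor,
    rewards_rejected: torch.Tensor,
    center_rewards_coefficient: float = 0.01,  # the recommended value in the help string
) -> torch.Tensor:
    # Standard Bradley-Terry pairwise loss ...
    loss = -torch.nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
    # ... plus a penalty on the mean reward magnitude, incentivizing centered outputs.
    loss = loss + center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2)
    return loss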
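
In `SFTConfig`, `chars_per_token` and `num_of_sequences` size the character buffer that `ConstantLengthDataset` fills before tokenizing and packing fixed-length sequences. A back-of-envelope with the defaults; the buffer formula is an assumption inferred from the help strings, not quoted from the dataset class:

max_seq_length = 1024    # tokens per packed example (default cap)
num_of_sequences = 1024  # packed sequences per buffer
chars_per_token = 3.6    # rough characters-per-token ratio for English text

max_buffer_size = max_seq_length * chars_per_token * num_of_sequences
print(f"{max_buffer_size:,.0f} characters buffered per packing round")  # ~3.8M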
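
Finally, the batch-size help strings in `OnPolicyConfig` encode fixed relationships between the derived sizes and the underlying `TrainingArguments`. A worked example under assumed values (2 GPUs, 4 gradient-accumulation steps); the mini-batch split follows the `num_mini_batches` help:

per_device_train_batch_size = 8
gradient_accumulation_steps = 4
world_size = 2
num_mini_batches = 2

micro_batch_size = per_device_train_batch_size * world_size                    # 16
local_batch_size = per_device_train_batch_size * gradient_accumulation_steps   # 32 (per GPU)
batch_size = micro_batch_size * gradient_accumulation_steps                    # 64 (global)
local_mini_batch_size = local_batch_size // num_mini_batches                   # 16
mini_batch_size = batch_size // num_mini_batches                               # 32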