From 52d213173ff844bc2ac5369c22ce35110a2bbe9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:29:09 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=9C=20Use=20field=20in=20dataclasses?= =?UTF-8?q?=20(#2494)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * in hh-rlhf-helpful-base * delete tokenize ds * dataset scripts * alignprop * judge tldr * ddpo * zen * sft video * literal to choices * chat * script args * alignprop * bco * better help format * cpo * ddpo * whether or not -> whether * dpo * dont set the possible values * `Optional[...]` to ... or `None` * xpo * gkd * kto * nash * online dpo * Fix typo in learning rate help message * orpo * more ... or `None` * model config * ppo * prm * reward * rloo * sft * online policy config * make style --- examples/datasets/hh-rlhf-helpful-base.py | 17 +- .../lm-human-preferences-descriptiveness.py | 19 +- .../lm-human-preferences-sentiment.py | 19 +- examples/datasets/math_shepherd.py | 19 +- examples/datasets/prm800k.py | 19 +- examples/datasets/rlaif-v.py | 19 +- examples/datasets/tldr.py | 19 +- examples/datasets/tldr_preference.py | 19 +- examples/datasets/tokenize_ds.py | 54 ---- examples/datasets/ultrafeedback-prompt.py | 19 +- examples/datasets/ultrafeedback.py | 81 +++-- examples/scripts/alignprop.py | 28 +- examples/scripts/ddpo.py | 28 +- examples/scripts/evals/judge_tldr.py | 20 +- examples/scripts/sft_video_llm.py | 12 +- scripts/generate_zen_dataset.py | 17 +- trl/data_utils.py | 6 +- trl/mergekit_utils.py | 8 +- trl/models/modeling_sd_base.py | 4 +- trl/models/utils.py | 2 +- trl/scripts/chat.py | 130 +++++--- trl/scripts/utils.py | 32 +- trl/trainer/alignprop_config.py | 119 +++++--- trl/trainer/bco_config.py | 138 +++++++-- trl/trainer/cpo_config.py | 124 ++++++-- trl/trainer/ddpo_config.py | 216 ++++++++++--- trl/trainer/dpo_config.py | 285 ++++++++++++++---- trl/trainer/gkd_config.py | 64 +++- trl/trainer/judges.py | 2 +- trl/trainer/kto_config.py | 151 ++++++++-- trl/trainer/model_config.py | 114 +++++-- trl/trainer/nash_md_config.py | 9 +- trl/trainer/online_dpo_config.py | 79 ++++- trl/trainer/orpo_config.py | 104 +++++-- trl/trainer/ppo_config.py | 66 +++- trl/trainer/prm_config.py | 47 ++- trl/trainer/reward_config.py | 40 ++- trl/trainer/rloo_config.py | 37 ++- trl/trainer/sft_config.py | 88 ++++-- trl/trainer/utils.py | 147 ++++++--- trl/trainer/xpo_config.py | 11 +- 41 files changed, 1827 insertions(+), 605 deletions(-) delete mode 100644 examples/datasets/tokenize_ds.py diff --git a/examples/datasets/hh-rlhf-helpful-base.py b/examples/datasets/hh-rlhf-helpful-base.py index e089ed108e..2a68daf7ec 100644 --- a/examples/datasets/hh-rlhf-helpful-base.py +++ b/examples/datasets/hh-rlhf-helpful-base.py @@ -13,7 +13,7 @@ # limitations under the License. import re -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -30,13 +30,20 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/hh-rlhf-helpful-base"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/hh-rlhf-helpful-base" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/hh-rlhf-helpful-base", metadata={"help": "Hugging Face repository ID to push the dataset to."} + ) + dataset_num_proc: Optional[int] = field( + default=None, metadata={"help": "Number of workers to use for dataset processing."} + ) def common_start(str1: str, str2: str) -> str: diff --git a/examples/datasets/lm-human-preferences-descriptiveness.py b/examples/datasets/lm-human-preferences-descriptiveness.py index 621757770c..a078b1a0eb 100644 --- a/examples/datasets/lm-human-preferences-descriptiveness.py +++ b/examples/datasets/lm-human-preferences-descriptiveness.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-descriptiveness"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/lm-human-preferences-descriptiveness" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/lm-human-preferences-descriptiveness", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) # Edge cases handling: remove the cases where all samples are the same diff --git a/examples/datasets/lm-human-preferences-sentiment.py b/examples/datasets/lm-human-preferences-sentiment.py index a3eaa4d06e..cbacab91a9 100644 --- a/examples/datasets/lm-human-preferences-sentiment.py +++ b/examples/datasets/lm-human-preferences-sentiment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-sentiment"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/lm-human-preferences-sentiment" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/lm-human-preferences-sentiment", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_prompt_completion(example, tokenizer): diff --git a/examples/datasets/math_shepherd.py b/examples/datasets/math_shepherd.py index c09e745ad5..214636fcde 100644 --- a/examples/datasets/math_shepherd.py +++ b/examples/datasets/math_shepherd.py @@ -13,7 +13,7 @@ # limitations under the License. import re -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -31,13 +31,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/math_shepherd"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/math_shepherd" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/math_shepherd", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def process_example(example): diff --git a/examples/datasets/prm800k.py b/examples/datasets/prm800k.py index b5f95742be..3078ab71ad 100644 --- a/examples/datasets/prm800k.py +++ b/examples/datasets/prm800k.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/prm800k"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/prm800k" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/prm800k", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def process_example(example): diff --git a/examples/datasets/rlaif-v.py b/examples/datasets/rlaif-v.py index 84ae292f87..dfe87d4d83 100644 --- a/examples/datasets/rlaif-v.py +++ b/examples/datasets/rlaif-v.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import features, load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/rlaif-v"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/rlaif-v" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/rlaif-v", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_conversational(example): diff --git a/examples/datasets/tldr.py b/examples/datasets/tldr.py index 0ae29481e3..767385c339 100644 --- a/examples/datasets/tldr.py +++ b/examples/datasets/tldr.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/tldr"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. 
""" - push_to_hub: bool = False - repo_id: str = "trl-lib/tldr" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/tldr", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_prompt_completion(example): diff --git a/examples/datasets/tldr_preference.py b/examples/datasets/tldr_preference.py index 1c4ff5bcbd..aa110af0c6 100644 --- a/examples/datasets/tldr_preference.py +++ b/examples/datasets/tldr_preference.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/tldr-preference"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/tldr-preference" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/tldr-preference", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_preference(example): diff --git a/examples/datasets/tokenize_ds.py b/examples/datasets/tokenize_ds.py deleted file mode 100644 index cd96a685a9..0000000000 --- a/examples/datasets/tokenize_ds.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset -from transformers import AutoTokenizer, HfArgumentParser - -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE - - -""" -python -i examples/datasets/tokenize_ds.py --model HuggingFaceH4/zephyr-7b-beta -python -i examples/datasets/tokenize_ds.py --model gpt2 -""" - - -@dataclass -class ScriptArguments: - dataset_name: str = field( - default="trl-internal-testing/hh-rlhf-helpful-base-trl-style", metadata={"help": "The dataset to load"} - ) - model: str = field(default="gpt2", metadata={"help": "The model to use for tokenization"}) - dataset_num_proc: Optional[int] = field( - default=None, metadata={"help": "The number of workers to use to tokenize the data"} - ) - - -if __name__ == "__main__": - script_args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0] - dataset = load_dataset(script_args.dataset_name) - tokenizer = AutoTokenizer.from_pretrained(script_args.model) - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE - - def process(row): - row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False) - row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False) - return row - - dataset = dataset.map(process, num_proc=script_args.dataset_num_proc) - print(dataset["train"][0]["chosen"]) diff --git a/examples/datasets/ultrafeedback-prompt.py b/examples/datasets/ultrafeedback-prompt.py index 3cb92467d5..7fecadc403 100644 --- a/examples/datasets/ultrafeedback-prompt.py +++ b/examples/datasets/ultrafeedback-prompt.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -29,13 +29,22 @@ class ScriptArguments: Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-prompt"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - push_to_hub: bool = False - repo_id: str = "trl-lib/ultrafeedback-prompt" - dataset_num_proc: Optional[int] = None + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/ultrafeedback-prompt", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_unpaired_preference(example): diff --git a/examples/datasets/ultrafeedback.py b/examples/datasets/ultrafeedback.py index cb6c556d0c..9670bef6d3 100644 --- a/examples/datasets/ultrafeedback.py +++ b/examples/datasets/ultrafeedback.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from datasets import load_dataset @@ -27,46 +27,61 @@ class ScriptArguments: Args: model_name (`str`, *optional*, defaults to `"gpt-3.5-turbo"`): Language model to target. 
Possible values are: - - - `"alpaca-7b"` - - `"bard"` - - `"falcon-40b-instruct"` - - `"gpt-3.5-turbo"` (default) - - `"gpt-4"` - - `"llama-2-13b-chat"` - - `"llama-2-70b-chat"` - - `"llama-2-7b-chat"` - - `"mpt-30b-chat"` - - `"pythia-12b"` - - `"starchat"` - - `"ultralm-13b"` - - `"ultralm-65b"` - - `"vicuna-33b"` - - `"wizardlm-13b"` - - `"wizardlm-70b"` - - `"wizardlm-7b"` - aspect (`str`, *optional*, defaults to `"helpfulness"`): - Aspect to target. Possible values are: - - - `"helpfulness"` (default) - - `"honesty"` - - `"instruction-following"` - - `"truthfulness"` - + Aspect to target. push_to_hub (`bool`, *optional*, defaults to `False`): Whether to push the dataset to the Hugging Face Hub. repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness"`): Hugging Face repository ID to push the dataset to. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of workers to use for dataset processing. """ - model_name: str = "gpt-3.5-turbo" - aspect: str = "helpfulness" - push_to_hub: bool = False - repo_id: str = "trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness" - dataset_num_proc: Optional[int] = None + model_name: str = field( + default="gpt-3.5-turbo", + metadata={ + "help": "Language model to target.", + "choices": [ + "alpaca-7b", + "bard", + "falcon-40b-instruct", + "gpt-3.5-turbo", + "gpt-4", + "llama-2-13b-chat", + "llama-2-70b-chat", + "llama-2-7b-chat", + "mpt-30b-chat", + "pythia-12b", + "starchat", + "ultralm-13b", + "ultralm-65b", + "vicuna-33b", + "wizardlm-13b", + "wizardlm-70b", + "wizardlm-7b", + ], + }, + ) + aspect: str = field( + default="helpfulness", + metadata={ + "help": "Aspect to target. Possible values are: 'helpfulness' (default), 'honesty', " + "'instruction-following', 'truthfulness'.", + "choices": ["helpfulness", "honesty", "instruction-following", "truthfulness"], + }, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of workers to use for dataset processing."}, + ) def to_unpaired_preference(example, model_name, aspect): diff --git a/examples/scripts/alignprop.py b/examples/scripts/alignprop.py index 918619c1e8..2203d06e32 100644 --- a/examples/scripts/alignprop.py +++ b/examples/scripts/alignprop.py @@ -37,20 +37,38 @@ @dataclass class ScriptArguments: + r""" + Arguments for the script. + + Args: + pretrained_model (`str`, *optional*, defaults to `"runwayml/stable-diffusion-v1-5"`): + Pretrained model to use. + pretrained_revision (`str`, *optional*, defaults to `"main"`): + Pretrained model revision to use. + hf_hub_model_id (`str`, *optional*, defaults to `"alignprop-finetuned-stable-diffusion"`): + HuggingFace repo to save model weights to. + hf_hub_aesthetic_model_id (`str`, *optional*, defaults to `"trl-lib/ddpo-aesthetic-predictor"`): + Hugging Face model ID for aesthetic scorer model weights. + hf_hub_aesthetic_model_filename (`str`, *optional*, defaults to `"aesthetic-model.pth"`): + Hugging Face model filename for aesthetic scorer model weights. + use_lora (`bool`, *optional*, defaults to `True`): + Whether to use LoRA. 
+ """ + pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} + default="runwayml/stable-diffusion-v1-5", metadata={"help": "Pretrained model to use."} ) - pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) + pretrained_revision: str = field(default="main", metadata={"help": "Pretrained model revision to use."}) hf_hub_model_id: str = field( - default="alignprop-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"} + default="alignprop-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to."} ) hf_hub_aesthetic_model_id: str = field( default="trl-lib/ddpo-aesthetic-predictor", - metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model ID for aesthetic scorer model weights."}, ) hf_hub_aesthetic_model_filename: str = field( default="aesthetic-model.pth", - metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model filename for aesthetic scorer model weights."}, ) use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."}) diff --git a/examples/scripts/ddpo.py b/examples/scripts/ddpo.py index 7919d5244a..07b1fbe84c 100644 --- a/examples/scripts/ddpo.py +++ b/examples/scripts/ddpo.py @@ -41,20 +41,38 @@ @dataclass class ScriptArguments: + r""" + Arguments for the script. + + Args: + pretrained_model (`str`, *optional*, defaults to `"runwayml/stable-diffusion-v1-5"`): + Pretrained model to use. + pretrained_revision (`str`, *optional*, defaults to `"main"`): + Pretrained model revision to use. + hf_hub_model_id (`str`, *optional*, defaults to `"ddpo-finetuned-stable-diffusion"`): + HuggingFace repo to save model weights to. + hf_hub_aesthetic_model_id (`str`, *optional*, defaults to `"trl-lib/ddpo-aesthetic-predictor"`): + Hugging Face model ID for aesthetic scorer model weights. + hf_hub_aesthetic_model_filename (`str`, *optional*, defaults to `"aesthetic-model.pth"`): + Hugging Face model filename for aesthetic scorer model weights. + use_lora (`bool`, *optional*, defaults to `True`): + Whether to use LoRA. 
+ """ + pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} + default="runwayml/stable-diffusion-v1-5", metadata={"help": "Pretrained model to use."} ) - pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) + pretrained_revision: str = field(default="main", metadata={"help": "Pretrained model revision to use."}) hf_hub_model_id: str = field( - default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"} + default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to."} ) hf_hub_aesthetic_model_id: str = field( default="trl-lib/ddpo-aesthetic-predictor", - metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model ID for aesthetic scorer model weights."}, ) hf_hub_aesthetic_model_filename: str = field( default="aesthetic-model.pth", - metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"}, + metadata={"help": "Hugging Face model filename for aesthetic scorer model weights."}, ) use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."}) diff --git a/examples/scripts/evals/judge_tldr.py b/examples/scripts/evals/judge_tldr.py index f9e51df729..537415d62c 100644 --- a/examples/scripts/evals/judge_tldr.py +++ b/examples/scripts/evals/judge_tldr.py @@ -47,14 +47,28 @@ @dataclass class ScriptArguments: - model_name_or_path: str = field(metadata={"help": "The model name or path to the model to evaluate."}) + r""" + Arguments for the script. + + Args: + model_name_or_path (`str`): + Model name or path to the model to evaluate. + judge_model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3-70B-Instruct"`): + Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or + 'meta-llama/Meta-Llama-3-70B-Instruct'. + num_examples (`int` or `None`, *optional*, defaults to `None`): + Number of examples to evaluate. + """ + + model_name_or_path: str = field(metadata={"help": "Model name or path to the model to evaluate."}) judge_model: str = field( default="meta-llama/Meta-Llama-3-70B-Instruct", metadata={ - "help": "The model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125', 'meta-llama/Meta-Llama-3-70B-Instruct'." + "help": "Model name or path to the model to use as a judge. E.g., 'gpt-3.5-turbo-0125' or " + "'meta-llama/Meta-Llama-3-70B-Instruct'." }, ) - num_examples: Optional[int] = field(default=None, metadata={"help": "The number of examples to evaluate."}) + num_examples: Optional[int] = field(default=None, metadata={"help": "Number of examples to evaluate."}) # Parse the arguments diff --git a/examples/scripts/sft_video_llm.py b/examples/scripts/sft_video_llm.py index 4a85114d4f..dd64936077 100644 --- a/examples/scripts/sft_video_llm.py +++ b/examples/scripts/sft_video_llm.py @@ -45,7 +45,7 @@ import json import os import random -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any import requests @@ -152,7 +152,15 @@ def collate_fn(examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]: @dataclass class CustomScriptArguments(ScriptArguments): - video_cache_dir: str = "/tmp/videos/" + r""" + Arguments for the script. + + Args: + video_cache_dir (`str`, *optional*, defaults to `"/tmp/videos/"`): + Video cache directory. 
+ """ + + video_cache_dir: str = field(default="/tmp/videos/", metadata={"help": "Video cache directory."}) if __name__ == "__main__": diff --git a/scripts/generate_zen_dataset.py b/scripts/generate_zen_dataset.py index 73c7c16f82..4bfc4e23f6 100644 --- a/scripts/generate_zen_dataset.py +++ b/scripts/generate_zen_dataset.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from datasets import Dataset from transformers import HfArgumentParser @@ -32,9 +32,18 @@ class ScriptArguments: Hugging Face repository ID to push the dataset to. """ - test_size: float = 0.1 - push_to_hub: bool = False - repo_id: str = "trl-internal-testing/zen" + test_size: float = field( + default=0.1, + metadata={"help": "Fraction of the dataset to include in the test split."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the dataset to the Hugging Face Hub."}, + ) + repo_id: str = field( + default="trl-internal-testing/zen", + metadata={"help": "Hugging Face repository ID to push the dataset to."}, + ) def main(test_size, push_to_hub, repo_id): diff --git a/trl/data_utils.py b/trl/data_utils.py index 8c8f448adf..fc8cd775a5 100644 --- a/trl/data_utils.py +++ b/trl/data_utils.py @@ -173,7 +173,7 @@ def maybe_apply_chat_template( messages, where each message is a dictionary with keys `"role"` and `"content"`. tokenizer (`PreTrainedTokenizer`): The tokenizer to apply the chat template with. - tools (`Optional[list[Union[dict, Callable]]]`, *optional*, defaults to `None`): + tools (`list[Union[dict, Callable]]` or `None`, *optional*, defaults to `None`): A list of tools (callable functions) that will be accessible to the model. If the template does not support function calling, this argument will have no effect @@ -224,7 +224,7 @@ def unpair_preference_dataset( dataset (`Dataset` or `DatasetDict`): Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally `"prompt"`. - num_proc (`Optional[int]`, *optional*, defaults to `None`): + num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. desc (`str` or `None`, *optional*, defaults to `None`): Meaningful description to be displayed alongside with the progress bar while mapping examples. @@ -265,7 +265,7 @@ def maybe_unpair_preference_dataset( dataset (`Dataset` or `DatasetDict`): Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally `"prompt"`. - num_proc (`Optional[int]`, *optional*, defaults to `None`): + num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. desc (`str` or `None`, *optional*, defaults to `None`): Meaningful description to be displayed alongside with the progress bar while mapping examples. diff --git a/trl/mergekit_utils.py b/trl/mergekit_utils.py index 936c42626c..382b91e61a 100644 --- a/trl/mergekit_utils.py +++ b/trl/mergekit_utils.py @@ -59,14 +59,14 @@ class MergeConfig: Attributes: method (`str`): The merge method to use. - policy_model_path (`Optional[str]`): Path to the policy model. - target_model_path (`Optional[str]`): Path to the target model. + policy_model_path (`str` or `None`): Path to the policy model. + target_model_path (`str` or `None`): Path to the target model. 
         policy_model_weight (`float`): Weight for the policy model (for `linear` and `ties` methods).
         target_model_weight (`float`): Weight for the target model (for `linear` and `ties` methods).
         policy_model_density (`list[float]`): Density parameters for the policy model (for `ties` and `dare_ties`).
         target_model_density (`list[float]`): Density parameters for the target model (for `ties` and `dare_ties`).
-        normalize (`Optional[float]`): Normalization factor for the TIES method.
-        t_values (`Optional[float]`): Interpolation factor for the SLERP method.
+        normalize (`float` or `None`): Normalization factor for the TIES method.
+        t_values (`float` or `None`): Interpolation factor for the SLERP method.
         dtype (`str`): Data type to use for merging, e.g., `"float16"`.
     """
diff --git a/trl/models/modeling_sd_base.py b/trl/models/modeling_sd_base.py
index 131d8d8016..dbe1cfd8f0 100644
--- a/trl/models/modeling_sd_base.py
+++ b/trl/models/modeling_sd_base.py
@@ -384,7 +384,7 @@ def pipeline_step(
             The output format of the generate image. Choose between
             [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
         return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+            Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
             plain tuple.
         callback (`Callable`, *optional*):
             A function that will be called every `callback_steps` steps during inference. The function will be
@@ -615,7 +615,7 @@ def pipeline_step_with_grad(
             The output format of the generate image. Choose between
             [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
         return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+            Whether to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
             plain tuple.
         callback (`Callable`, *optional*):
             A function that will be called every `callback_steps` steps during inference. The function will be
diff --git a/trl/models/utils.py b/trl/models/utils.py
index 53cf481f1f..3f26146d0e 100644
--- a/trl/models/utils.py
+++ b/trl/models/utils.py
@@ -90,7 +90,7 @@ def setup_chat_format(
         model (`~transformers.PreTrainedModel`): The model to be modified.
         tokenizer (`~transformers.PreTrainedTokenizer`): The tokenizer to be modified.
         format (`Optional[Literal["chatml"]]`): The format to be set. Defaults to "chatml".
-        resize_to_multiple_of (`Optional[int]`): Number to resize the embedding layer to. Defaults to None.
+        resize_to_multiple_of (`int` or `None`): Number to resize the embedding layer to. Defaults to None.
 
     Returns:
         model (`~transformers.PreTrainedModel`): The modified model.
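The docstring changes above, rewriting `Optional[...]` as "`X` or `None`", are cosmetic; the substantive conversion this patch repeats in every config is turning a bare class-level default such as `dataset_num_proc: Optional[int] = None` into a `dataclasses.field(...)` whose `metadata["help"]` string `HfArgumentParser` surfaces in `--help` output. A minimal sketch of that mechanism, assuming a hypothetical `DemoArguments` class and `demo.py` script that are not part of this patch:

# Minimal sketch of the field/metadata pattern used throughout this patch.
# `DemoArguments` and `demo.py` are hypothetical, not files in this diff.
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoArguments:
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether to push the dataset to the Hugging Face Hub."},
    )
    dataset_num_proc: Optional[int] = field(
        default=None,
        metadata={"help": "Number of workers to use for dataset processing."},
    )


if __name__ == "__main__":
    parser = HfArgumentParser(DemoArguments)
    # `python demo.py --help` now lists both options with the help strings above;
    # with a bare `push_to_hub: bool = False` the option would appear undocumented.
    (args,) = parser.parse_args_into_dataclasses()
    print(args)

The annotations themselves keep `Optional[...]` because the parser derives each option's type from the type hint; only the docstrings, which feed the rendered API documentation, switch to the "`int` or `None`" phrasing.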
diff --git a/trl/scripts/chat.py b/trl/scripts/chat.py index fa9eebc44e..5adc6ea779 100644 --- a/trl/scripts/chat.py +++ b/trl/scripts/chat.py @@ -22,6 +22,7 @@ import time from dataclasses import dataclass, field from threading import Thread +from typing import Optional import torch import yaml @@ -81,67 +82,118 @@ @dataclass class ChatArguments: - # general settings - model_name_or_path: str = field(metadata={"help": "Name of the pre-trained model"}) - user: str = field(default=None, metadata={"help": "Username to display in chat interface"}) - system_prompt: str = field(default=None, metadata={"help": "System prompt"}) - save_folder: str = field(default="./chat_history/", metadata={"help": "Folder to save chat history"}) - device: str = field( - default="cpu", - metadata={"help": "device to use for inference."}, - ) - examples_path: str = field(default=None, metadata={"help": "Path to a yaml file with examples"}) - # generation settings - max_new_tokens: int = field(default=256, metadata={"help": "Maximum number of tokens to generate"}) - do_sample: bool = field(default=True, metadata={"help": "Whether to sample outputs during generation"}) - num_beams: int = field(default=1, metadata={"help": "Number of beams for beam search"}) - temperature: float = field(default=1.0, metadata={"help": "Temperature parameter for generation"}) - top_k: int = field(default=50, metadata={"help": "Value of k for top-k sampling"}) - top_p: float = field(default=1.0, metadata={"help": "Value of p for nucleus sampling"}) - repetition_penalty: float = field(default=1.0, metadata={"help": "Repetition penalty"}) - eos_tokens: str = field( + r""" + Arguments for the chat script. + + Args: + model_name_or_path (`str`): + Name of the pre-trained model. + user (`str` or `None`, *optional*, defaults to `None`): + Username to display in chat interface. + system_prompt (`str` or `None`, *optional*, defaults to `None`): + System prompt. + save_folder (`str`, *optional*, defaults to `"./chat_history/"`): + Folder to save chat history. + device (`str`, *optional*, defaults to `"cpu"`): + Device to use for inference. + examples_path (`str` or `None`, *optional*, defaults to `None`): + Path to a yaml file with examples. + max_new_tokens (`int`, *optional*, defaults to `256`): + Maximum number of tokens to generate. + do_sample (`bool`, *optional*, defaults to `True`): + Whether to sample outputs during generation. + num_beams (`int`, *optional*, defaults to `1`): + Number of beams for beam search. + temperature (`float`, *optional*, defaults to `1.0`): + Temperature parameter for generation. + top_k (`int`, *optional*, defaults to `50`): + Value of k for top-k sampling. + top_p (`float`, *optional*, defaults to `1.0`): + Value of p for nucleus sampling. + repetition_penalty (`float`, *optional*, defaults to `1.0`): + Repetition penalty. + eos_tokens (`str` or `None`, *optional*, defaults to `None`): + EOS tokens to stop the generation. If multiple they should be comma separated. + eos_token_ids (`str` or `None`, *optional*, defaults to `None`): + EOS token IDs to stop the generation. If multiple they should be comma separated. + model_revision (`str`, *optional*, defaults to `"main"`): + Specific model version to use (can be a branch name, tag name or commit id). + torch_dtype (`str` or `None`, *optional*, defaults to `None`): + Override the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype + will be automatically derived from the model's weights. 
+ trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether to trust remote code when loading a model. + attn_implementation (`str` or `None`, *optional*, defaults to `None`): + Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case + you must install this manually by running `pip install flash-attn --no-build-isolation`. + load_in_8bit (`bool`, *optional*, defaults to `False`): + Whether to use 8 bit precision for the base model - works only with LoRA. + load_in_4bit (`bool`, *optional*, defaults to `False`): + Whether to use 4 bit precision for the base model - works only with LoRA. + bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`): + Quantization type. + use_bnb_nested_quant (`bool`, *optional*, defaults to `False`): + Whether to use nested quantization. + """ + + # General settings + model_name_or_path: str = field(metadata={"help": "Name of the pre-trained model."}) + user: Optional[str] = field(default=None, metadata={"help": "Username to display in chat interface."}) + system_prompt: Optional[str] = field(default=None, metadata={"help": "System prompt."}) + save_folder: str = field(default="./chat_history/", metadata={"help": "Folder to save chat history."}) + device: str = field(default="cpu", metadata={"help": "Device to use for inference."}) + examples_path: Optional[str] = field(default=None, metadata={"help": "Path to a yaml file with examples."}) + + # Generation settings + max_new_tokens: int = field(default=256, metadata={"help": "Maximum number of tokens to generate."}) + do_sample: bool = field(default=True, metadata={"help": "Whether to sample outputs during generation."}) + num_beams: int = field(default=1, metadata={"help": "Number of beams for beam search."}) + temperature: float = field(default=1.0, metadata={"help": "Temperature parameter for generation."}) + top_k: int = field(default=50, metadata={"help": "Value of k for top-k sampling."}) + top_p: float = field(default=1.0, metadata={"help": "Value of p for nucleus sampling."}) + repetition_penalty: float = field(default=1.0, metadata={"help": "Repetition penalty."}) + eos_tokens: Optional[str] = field( default=None, - metadata={"help": "EOS tokens to stop the generation. If multiple they should be comma separated"}, + metadata={"help": "EOS tokens to stop the generation. If multiple they should be comma separated."}, ) - eos_token_ids: str = field( + eos_token_ids: Optional[str] = field( default=None, - metadata={"help": "EOS token IDs to stop the generation. If multiple they should be comma separated"}, + metadata={"help": "EOS token IDs to stop the generation. If multiple they should be comma separated."}, ) - # model loading + + # Model loading model_revision: str = field( default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + metadata={"help": "Specific model version to use (can be a branch name, tag name or commit id)."}, ) - torch_dtype: str = field( + torch_dtype: Optional[str] = field( default=None, metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." - ), + "help": "Override the default `torch.dtype` and load the model under this dtype. 
If `'auto'` is passed, " + "the dtype will be automatically derived from the model's weights.", "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) - attn_implementation: str = field( + trust_remote_code: bool = field( + default=False, metadata={"help": "Whether to trust remote code when loading a model."} + ) + attn_implementation: Optional[str] = field( default=None, metadata={ - "help": ( - "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" - ) + "help": "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in " + "which case you must install this manually by running `pip install flash-attn --no-build-isolation`." }, ) load_in_8bit: bool = field( default=False, - metadata={"help": "use 8 bit precision for the base model - works only with LoRA"}, + metadata={"help": "Whether to use 8 bit precision for the base model - works only with LoRA."}, ) load_in_4bit: bool = field( default=False, - metadata={"help": "use 4 bit precision for the base model - works only with LoRA"}, + metadata={"help": "Whether to use 4 bit precision for the base model - works only with LoRA."}, ) - - bnb_4bit_quant_type: str = field(default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"}) - use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) + bnb_4bit_quant_type: str = field(default="nf4", metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]}) + use_bnb_nested_quant: bool = field(default=False, metadata={"help": "Whether to use nested quantization."}) class RichInterface: diff --git a/trl/scripts/utils.py b/trl/scripts/utils.py index e386a19d37..a381ffe300 100644 --- a/trl/scripts/utils.py +++ b/trl/scripts/utils.py @@ -18,7 +18,7 @@ import os import subprocess import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Iterable, Optional, Union import yaml @@ -44,18 +44,34 @@ class ScriptArguments: dataset_test_split (`str`, *optional*, defaults to `"test"`): Dataset split to use for evaluation. gradient_checkpointing_use_reentrant (`bool`, *optional*, defaults to `False`): - Whether to apply `use_reentrant` for gradient_checkpointing. + Whether to apply `use_reentrant` for gradient checkpointing. ignore_bias_buffers (`bool`, *optional*, defaults to `False`): Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992. """ - dataset_name: str - dataset_config: Optional[str] = None - dataset_train_split: str = "train" - dataset_test_split: str = "test" - gradient_checkpointing_use_reentrant: bool = False - ignore_bias_buffers: bool = False + dataset_name: str = field(metadata={"help": "Dataset name."}) + dataset_config: Optional[str] = field( + default=None, + metadata={ + "help": "Dataset configuration name. Corresponds to the `name` argument of the `datasets.load_dataset` " + "function." 
+ }, + ) + dataset_train_split: str = field(default="train", metadata={"help": "Dataset split to use for training."}) + dataset_test_split: str = field(default="test", metadata={"help": "Dataset split to use for evaluation."}) + gradient_checkpointing_use_reentrant: bool = field( + default=False, + metadata={"help": "Whether to apply `use_reentrant` for gradient checkpointing."}, + ) + ignore_bias_buffers: bool = field( + default=False, + metadata={ + "help": "Debug argument for distributed training. Fix for DDP issues with LM bias/mask buffers - invalid " + "scalar type, inplace operation. See " + "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992." + }, + ) def init_zero_verbose(): diff --git a/trl/trainer/alignprop_config.py b/trl/trainer/alignprop_config.py index 1c4faa963e..6cfc061d31 100644 --- a/trl/trainer/alignprop_config.py +++ b/trl/trainer/alignprop_config.py @@ -15,7 +15,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Any, Literal, Optional +from typing import Any, Optional from transformers import is_bitsandbytes_available @@ -36,7 +36,9 @@ class AlignPropConfig: Name of this experiment (defaults to the file name without the extension). run_name (`str`, *optional*, defaults to `""`): Name of this run. - log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + seed (`int`, *optional*, defaults to `0`): + Random seed for reproducibility. + log_with (`str` or `None`, *optional*, defaults to `None`): Log with either `"wandb"` or `"tensorboard"`. Check [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. log_image_freq (`int`, *optional*, defaults to `1`): @@ -69,6 +71,8 @@ class AlignPropConfig: Eta parameter for the DDIM sampler. sample_guidance_scale (`float`, *optional*, defaults to `5.0`): Classifier-free guidance weight. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size for training. train_use_8bit_adam (`bool`, *optional*, defaults to `False`): Whether to use the 8bit Adam optimizer from `bitsandbytes`. train_learning_rate (`float`, *optional*, defaults to `1e-3`): @@ -85,7 +89,7 @@ class AlignPropConfig: Number of gradient accumulation steps. train_max_grad_norm (`float`, *optional*, defaults to `1.0`): Maximum gradient norm for gradient clipping. - negative_prompts (`Optional[str]`, *optional*, defaults to `None`): + negative_prompts (`str` or `None`, *optional*, defaults to `None`): Comma-separated list of prompts to use as negative examples. truncated_backprop_rand (`bool`, *optional*, defaults to `True`): If `True`, randomized truncation to different diffusion timesteps is used. @@ -97,39 +101,82 @@ class AlignPropConfig: Whether to push the final model to the Hub. 
""" - exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - run_name: str = "" - seed: int = 0 - log_with: Optional[Literal["wandb", "tensorboard"]] = None - log_image_freq: int = 1 - tracker_kwargs: dict[str, Any] = field(default_factory=dict) - accelerator_kwargs: dict[str, Any] = field(default_factory=dict) - project_kwargs: dict[str, Any] = field(default_factory=dict) - tracker_project_name: str = "trl" - logdir: str = "logs" - num_epochs: int = 100 - save_freq: int = 1 - num_checkpoint_limit: int = 5 - mixed_precision: str = "fp16" - allow_tf32: bool = True - resume_from: str = "" - sample_num_steps: int = 50 - sample_eta: float = 1.0 - sample_guidance_scale: float = 5.0 - train_batch_size: int = 1 - train_use_8bit_adam: bool = False - train_learning_rate: float = 1e-3 - train_adam_beta1: float = 0.9 - train_adam_beta2: float = 0.999 - train_adam_weight_decay: float = 1e-4 - train_adam_epsilon: float = 1e-8 - train_gradient_accumulation_steps: int = 1 - train_max_grad_norm: float = 1.0 - negative_prompts: Optional[str] = None - truncated_backprop_rand: bool = True - truncated_backprop_timestep: int = 49 - truncated_rand_backprop_minmax: tuple[int, int] = (0, 50) - push_to_hub: bool = False + exp_name: str = field( + default=os.path.basename(sys.argv[0])[: -len(".py")], + metadata={"help": "Name of this experiment (defaults to the file name without the extension)."}, + ) + run_name: str = field(default="", metadata={"help": "Name of this run."}) + seed: int = field(default=0, metadata={"help": "Random seed for reproducibility."}) + log_with: Optional[str] = field( + default=None, + metadata={"help": "Log with either 'wandb' or 'tensorboard'.", "choices": ["wandb", "tensorboard"]}, + ) + log_image_freq: int = field(default=1, metadata={"help": "Frequency for logging images."}) + tracker_kwargs: dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the tracker (e.g., `wandb_project`)."}, + ) + accelerator_kwargs: dict[str, Any] = field( + default_factory=dict, metadata={"help": "Keyword arguments for the accelerator."} + ) + project_kwargs: dict[str, Any] = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator project config (e.g., `logging_dir`)."}, + ) + tracker_project_name: str = field(default="trl", metadata={"help": "Name of project to use for tracking."}) + logdir: str = field(default="logs", metadata={"help": "Top-level logging directory for checkpoint saving."}) + num_epochs: int = field(default=100, metadata={"help": "Number of epochs to train."}) + save_freq: int = field(default=1, metadata={"help": "Number of epochs between saving model checkpoints."}) + num_checkpoint_limit: int = field( + default=5, metadata={"help": "Number of checkpoints to keep before overwriting old ones."} + ) + mixed_precision: str = field( + default="fp16", + metadata={ + "help": "Mixed precision training. 
Possible values are 'fp16', 'bf16', 'none'.", + "choices": ["fp16", "bf16", "none"], + }, + ) + allow_tf32: bool = field(default=True, metadata={"help": "Allow `tf32` on Ampere GPUs."}) + resume_from: str = field(default="", metadata={"help": "Path to resume training from a checkpoint."}) + sample_num_steps: int = field(default=50, metadata={"help": "Number of sampler inference steps."}) + sample_eta: float = field(default=1.0, metadata={"help": "Eta parameter for the DDIM sampler."}) + sample_guidance_scale: float = field(default=5.0, metadata={"help": "Classifier-free guidance weight."}) + train_batch_size: int = field(default=1, metadata={"help": "Batch size for training."}) + train_use_8bit_adam: bool = field( + default=False, metadata={"help": "Whether to use the 8bit Adam optimizer from `bitsandbytes`."} + ) + train_learning_rate: float = field(default=1e-3, metadata={"help": "Learning rate."}) + train_adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer."}) + train_adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer."}) + train_adam_weight_decay: float = field(default=1e-4, metadata={"help": "Weight decay for Adam optimizer."}) + train_adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon value for Adam optimizer."}) + train_gradient_accumulation_steps: int = field( + default=1, metadata={"help": "Number of gradient accumulation steps."} + ) + train_max_grad_norm: float = field(default=1.0, metadata={"help": "Maximum gradient norm for gradient clipping."}) + negative_prompts: Optional[str] = field( + default=None, + metadata={"help": "Comma-separated list of prompts to use as negative examples."}, + ) + truncated_backprop_rand: bool = field( + default=True, + metadata={"help": "If `True`, randomized truncation to different diffusion timesteps is used."}, + ) + truncated_backprop_timestep: int = field( + default=49, + metadata={ + "help": "Absolute timestep to which the gradients are backpropagated. Used only if " + "`truncated_backprop_rand=False`." + }, + ) + truncated_rand_backprop_minmax: tuple[int, int] = field( + default=(0, 50), + metadata={ + "help": "Range of diffusion timesteps for randomized truncated backpropagation.", + }, + ) + push_to_hub: bool = field(default=False, metadata={"help": "Whether to push the final model to the Hub."}) def to_dict(self): output_dict = {} diff --git a/trl/trainer/bco_config.py b/trl/trainer/bco_config.py index 5163080112..b6a5db07b0 100644 --- a/trl/trainer/bco_config.py +++ b/trl/trainer/bco_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Optional from transformers import TrainingArguments @@ -28,12 +28,12 @@ class BCOConfig(TrainingArguments): command line. Parameters: - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. 
- max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion. This argument is required if you want to use the default data collator and your model is an encoder-decoder. beta (`float`, *optional*, defaults to `0.1`): @@ -41,7 +41,7 @@ class BCOConfig(TrainingArguments): reference model. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. @@ -51,19 +51,19 @@ class BCOConfig(TrainingArguments): generate_during_eval (`bool`, *optional*, defaults to `False`): If `True`, generates and logs completions from both the model and the reference model to W&B or Comet during evaluation. - is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, you need to specify if the model returned by the callable is an encoder-decoder model. precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): Whether to precompute reference model log probabilities for training and evaluation datasets. This is useful when training without the reference model to reduce the total GPU memory needed. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model from a string. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. prompt_sample_size (`int`, *optional*, defaults to `1024`): Number of prompts that are fed to density ratio classifier. @@ -73,20 +73,106 @@ class BCOConfig(TrainingArguments): Maximum value of the density ratio. The estimated density ratio is clamped to this value. 
""" - max_length: Optional[int] = None - max_prompt_length: Optional[int] = None - max_completion_length: Optional[int] = None - beta: float = 0.1 - label_pad_token_id: int = -100 - padding_value: Optional[int] = None - truncation_mode: str = "keep_end" - disable_dropout: bool = True - generate_during_eval: bool = False - is_encoder_decoder: Optional[bool] = None - precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[dict[str, Any]] = None - ref_model_init_kwargs: Optional[dict[str, Any]] = None - dataset_num_proc: Optional[int] = None - prompt_sample_size: int = 1024 - min_density_ratio: float = 0.5 - max_density_ratio: float = 10.0 + max_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the sequences (prompt + completion) in the batch. " + "This argument is required if you want to use the default data collator." + }, + ) + max_prompt_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the prompt. " + "This argument is required if you want to use the default data collator." + }, + ) + max_completion_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum length of the completion. This argument is required if you want to use the " + "default data collator and your model is an encoder-decoder." + }, + ) + beta: float = field( + default=0.1, + metadata={ + "help": "Parameter controlling the deviation from the reference model. " + "Higher β means less deviation from the reference model." + }, + ) + label_pad_token_id: int = field( + default=-100, + metadata={ + "help": "Label pad token id. This argument is required if you want to use the default data collator." + }, + ) + padding_value: Optional[int] = field( + default=None, + metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."}, + ) + truncation_mode: str = field( + default="keep_end", + metadata={ + "help": "Truncation mode to use when the prompt is too long. Possible values are " + "`keep_end` or `keep_start`. This argument is required if you want to use the " + "default data collator." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model and reference model."}, + ) + generate_during_eval: bool = field( + default=False, + metadata={ + "help": "If `True`, generates and logs completions from both the model and the reference model " + "to W&B during evaluation." + }, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the " + "`model` argument, you need to specify if the model returned by the callable is an " + "encoder-decoder model." + }, + ) + precompute_ref_log_probs: bool = field( + default=False, + metadata={ + "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. " + "This is useful when training without the reference model to reduce the total GPU memory " + "needed." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "model from a string." + }, + ) + ref_model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "reference model from a string." 
+ }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + prompt_sample_size: int = field( + default=1024, + metadata={"help": "Number of prompts that are fed to density ratio classifier."}, + ) + min_density_ratio: float = field( + default=0.5, + metadata={"help": "Minimum value of the density ratio. The estimated density ratio is clamped to this value."}, + ) + max_density_ratio: float = field( + default=10.0, + metadata={"help": "Maximum value of the density ratio. The estimated density ratio is clamped to this value."}, + ) diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py index a451d7c09c..65b1187466 100644 --- a/trl/trainer/cpo_config.py +++ b/trl/trainer/cpo_config.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Any, Literal, Optional +from dataclasses import dataclass, field +from typing import Any, Optional from transformers import TrainingArguments @@ -31,12 +31,12 @@ class CPOConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `1e-6`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion. This argument is required if you want to use the default data collator and your model is an encoder-decoder. beta (`float`, *optional*, defaults to `0.1`): @@ -61,37 +61,109 @@ class CPOConfig(TrainingArguments): Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`,*optional*, defaults to `"keep_end"`): Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. This argument is required if you want to use the default data collator. generate_during_eval (`bool`, *optional*, defaults to `False`): If `True`, generates and logs completions from the model to W&B or Comet during evaluation. - is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, you need to specify if the model returned by the callable is an encoder-decoder model. 
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    label_smoothing: float = 0.0
-    loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid"
-    disable_dropout: bool = True
-    cpo_alpha: float = 1.0
-    simpo_gamma: float = 0.5
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "hinge", "ipo", "simpo"],
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    cpo_alpha: float = field(
+        default=1.0,
+        metadata={"help": "Weight of the BC regularizer in CPO training."},
+    )
+    simpo_gamma: float = field(
+        default=0.5,
+        metadata={"help": "Target reward margin for the SimPO loss, used only when `loss_type='simpo'`."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={"help": "Label pad token id."},
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."},
+    )
+    is_encoder_decoder: Optional[bool] = field(
+        default=None,
+        metadata={"help": "Whether the model is an encoder-decoder model."},
+    )
+    model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
diff --git a/trl/trainer/ddpo_config.py b/trl/trainer/ddpo_config.py
index ca703eb806..06ba94a48e 100644
--- a/trl/trainer/ddpo_config.py
+++ b/trl/trainer/ddpo_config.py
@@ -15,7 +15,7 @@
 import os
 import sys
 from dataclasses import dataclass, field
-from typing import Literal, Optional
+from typing import Optional
 
 from transformers import is_bitsandbytes_available
 
@@ -38,7 +38,7 @@ class DDPOConfig:
            Name of this run.
        seed (`int`, *optional*, defaults to `0`):
            Random seed.
-        log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`):
+        log_with (`Literal["wandb", "tensorboard"]` or `None`, *optional*, defaults to `None`):
            Log with either 'wandb' or 'tensorboard', check
            https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
        tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
@@ -94,7 +94,7 @@ class DDPOConfig:
        train_num_inner_epochs (`int`, *optional*, defaults to `1`):
            Number of inner epochs per outer epoch.
        train_cfg (`bool`, *optional*, defaults to `True`):
-            Whether or not to use classifier-free guidance during training.
+            Whether to use classifier-free guidance during training.
        train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
            Clip advantages to the range.
        train_clip_range (`float`, *optional*, defaults to `1e-4`):
@@ -111,53 +111,179 @@ class DDPOConfig:
            Whether to compute rewards asynchronously.
        max_workers (`int`, *optional*, defaults to `2`):
            Maximum number of workers to use for async reward computation.
-        negative_prompts (`Optional[str]`, *optional*, defaults to `""`):
+        negative_prompts (`str`, *optional*, defaults to `""`):
            Comma-separated list of prompts to use as negative examples.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the final model checkpoint to the Hub.
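Dropping the `Literal[...]` annotation in favor of a plain type plus a `"choices"` metadata key (as done for `log_with` above) keeps command-line validation, because `HfArgumentParser` forwards field metadata such as `help` and `choices` to `argparse.ArgumentParser.add_argument`. A minimal sketch of that behavior, assuming standard `HfArgumentParser` semantics (`DemoLogConfig` is hypothetical):

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoLogConfig:
    log_with: Optional[str] = field(
        default=None,
        metadata={
            "help": "Log with either 'wandb' or 'tensorboard'.",
            "choices": ["wandb", "tensorboard"],
        },
    )


parser = HfArgumentParser(DemoLogConfig)
(config,) = parser.parse_args_into_dataclasses(["--log_with", "wandb"])
print(config.log_with)  # wandb
# An invalid value such as "--log_with mlflow" makes argparse exit with an
# "invalid choice" error, matching what the removed Literal type documented.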
""" - exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - run_name: str = "" - seed: int = 0 - log_with: Optional[Literal["wandb", "tensorboard"]] = None - tracker_kwargs: dict = field(default_factory=dict) - accelerator_kwargs: dict = field(default_factory=dict) - project_kwargs: dict = field(default_factory=dict) - tracker_project_name: str = "trl" - logdir: str = "logs" - num_epochs: int = 100 - save_freq: int = 1 - num_checkpoint_limit: int = 5 - mixed_precision: str = "fp16" - allow_tf32: bool = True - resume_from: str = "" - sample_num_steps: int = 50 - sample_eta: float = 1.0 - sample_guidance_scale: float = 5.0 - sample_batch_size: int = 1 - sample_num_batches_per_epoch: int = 2 - train_batch_size: int = 1 - train_use_8bit_adam: bool = False - train_learning_rate: float = 3e-4 - train_adam_beta1: float = 0.9 - train_adam_beta2: float = 0.999 - train_adam_weight_decay: float = 1e-4 - train_adam_epsilon: float = 1e-8 - train_gradient_accumulation_steps: int = 1 - train_max_grad_norm: float = 1.0 - train_num_inner_epochs: int = 1 - train_cfg: bool = True - train_adv_clip_max: float = 5.0 - train_clip_range: float = 1e-4 - train_timestep_fraction: float = 1.0 - per_prompt_stat_tracking: bool = False - per_prompt_stat_tracking_buffer_size: int = 16 - per_prompt_stat_tracking_min_count: int = 16 - async_reward_computation: bool = False - max_workers: int = 2 - negative_prompts: str = "" - push_to_hub: bool = False + exp_name: str = field( + default=os.path.basename(sys.argv[0])[: -len(".py")], + metadata={"help": "Name of this experiment (by default is the file name without the extension name)."}, + ) + run_name: str = field( + default="", + metadata={"help": "Name of this run."}, + ) + seed: int = field( + default=0, + metadata={"help": "Random seed."}, + ) + log_with: Optional[str] = field( + default=None, + metadata={ + "help": "Log with either 'wandb' or 'tensorboard'.", + "choices": ["wandb", "tensorboard"], + }, + ) + tracker_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the tracker (e.g. wandb_project)."}, + ) + accelerator_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator."}, + ) + project_kwargs: dict = field( + default_factory=dict, + metadata={"help": "Keyword arguments for the accelerator project config (e.g. 
`logging_dir`)."}, + ) + tracker_project_name: str = field( + default="trl", + metadata={"help": "Name of project to use for tracking."}, + ) + logdir: str = field( + default="logs", + metadata={"help": "Top-level logging directory for checkpoint saving."}, + ) + num_epochs: int = field( + default=100, + metadata={"help": "Number of epochs to train."}, + ) + save_freq: int = field( + default=1, + metadata={"help": "Number of epochs between saving model checkpoints."}, + ) + num_checkpoint_limit: int = field( + default=5, + metadata={"help": "Number of checkpoints to keep before overwriting old ones."}, + ) + mixed_precision: str = field( + default="fp16", + metadata={"help": "Mixed precision training."}, + ) + allow_tf32: bool = field( + default=True, + metadata={"help": "Allow `tf32` on Ampere GPUs."}, + ) + resume_from: str = field( + default="", + metadata={"help": "Resume training from a checkpoint."}, + ) + sample_num_steps: int = field( + default=50, + metadata={"help": "Number of sampler inference steps."}, + ) + sample_eta: float = field( + default=1.0, + metadata={"help": "Eta parameter for the DDIM sampler."}, + ) + sample_guidance_scale: float = field( + default=5.0, + metadata={"help": "Classifier-free guidance weight."}, + ) + sample_batch_size: int = field( + default=1, + metadata={"help": "Batch size (per GPU) to use for sampling."}, + ) + sample_num_batches_per_epoch: int = field( + default=2, + metadata={"help": "Number of batches to sample per epoch."}, + ) + train_batch_size: int = field( + default=1, + metadata={"help": "Batch size (per GPU) to use for training."}, + ) + train_use_8bit_adam: bool = field( + default=False, + metadata={"help": "Use 8bit Adam optimizer from bitsandbytes."}, + ) + train_learning_rate: float = field( + default=3e-4, + metadata={"help": "Learning rate."}, + ) + train_adam_beta1: float = field( + default=0.9, + metadata={"help": "Adam beta1."}, + ) + train_adam_beta2: float = field( + default=0.999, + metadata={"help": "Adam beta2."}, + ) + train_adam_weight_decay: float = field( + default=1e-4, + metadata={"help": "Adam weight decay."}, + ) + train_adam_epsilon: float = field( + default=1e-8, + metadata={"help": "Adam epsilon."}, + ) + train_gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of gradient accumulation steps."}, + ) + train_max_grad_norm: float = field( + default=1.0, + metadata={"help": "Maximum gradient norm for gradient clipping."}, + ) + train_num_inner_epochs: int = field( + default=1, + metadata={"help": "Number of inner epochs per outer epoch."}, + ) + train_cfg: bool = field( + default=True, + metadata={"help": "Whether to use classifier-free guidance during training."}, + ) + train_adv_clip_max: float = field( + default=5.0, + metadata={"help": "Clip advantages to the range."}, + ) + train_clip_range: float = field( + default=1e-4, + metadata={"help": "PPO clip range."}, + ) + train_timestep_fraction: float = field( + default=1.0, + metadata={"help": "Fraction of timesteps to train on."}, + ) + per_prompt_stat_tracking: bool = field( + default=False, + metadata={"help": "Whether to track statistics for each prompt separately."}, + ) + per_prompt_stat_tracking_buffer_size: int = field( + default=16, + metadata={"help": "Number of reward values to store in the buffer for each prompt."}, + ) + per_prompt_stat_tracking_min_count: int = field( + default=16, + metadata={"help": "Minimum number of reward values to store in the buffer."}, + ) + async_reward_computation: bool = field( + 
default=False, + metadata={"help": "Whether to compute rewards asynchronously."}, + ) + max_workers: int = field( + default=2, + metadata={"help": "Maximum number of workers to use for async reward computation."}, + ) + negative_prompts: str = field( + default="", + metadata={"help": "Comma-separated list of prompts to use as negative examples."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the final model checkpoint to the Hub."}, + ) def to_dict(self): output_dict = {} diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 966ea7079c..9c98f9e910 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum -from typing import Any, Literal, Optional +from typing import Any, Optional from transformers import TrainingArguments @@ -67,20 +67,20 @@ class DPOConfig(TrainingArguments): - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. use_weighting (`bool`, *optional*, defaults to `False`): - Whether or not to weight the loss as done in the [WPO](https://huggingface.co/papers/2406.11827) paper. + Whether to weight the loss as done in the [WPO](https://huggingface.co/papers/2406.11827) paper. label_pad_token_id (`int`, *optional*, defaults to `-100`): Label pad token id. This argument is required if you want to use the default data collator. - padding_value (`Optional[int]`, *optional*, defaults to `None`): + padding_value (`int` or `None`, *optional*, defaults to `None`): Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): Truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + max_prompt_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. is_encoder_decoder(`Optional[int]`, *optional*, defaults to `None`): @@ -94,21 +94,21 @@ class DPOConfig(TrainingArguments): precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): Whether to precompute reference model log probabilities for training and evaluation datasets. This is useful when training without the reference model to reduce the total GPU memory needed. - precompute_ref_batch_size (`Optional[int]`, *optional*, defaults to `None`): + precompute_ref_batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size to use when precomputing reference model log probabilities. 
This can be set higher than the training batch size to speed up preprocessing. If `None`, defaults to `per_device_train_batch_size` for training and `per_device_eval_batch_size` for evaluation. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model from a string. - model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. - ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. reference_free (`bool`, *optional*, defaults to `False`): If `True`, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal @@ -141,54 +141,217 @@ class DPOConfig(TrainingArguments): τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`. use_num_logits_to_keep (`bool`, *optional*, defaults to `False`): - If `True`, only a specified number of logits are computed in the forward pass of CausalLM. This can be useful - for saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios - when working with very long prompts where labels are -ignored (-100). + If `True`, only a specified number of logits are computed in the forward pass of CausalLM. This can be + useful for saving memory and speeding up training by not computing the logits for all tokens, especially in + scenarios when working with very long prompts where labels are ignored (-100). 
            [Read more](https://huggingface.co/docs/transformers/main/model_doc/llama#transformers.LlamaForCausalLM)
    """

-    learning_rate: float = 1e-6
-    beta: float = 0.1
-    label_smoothing: float = 0.0
-    loss_type: Literal[
-        "sigmoid",
-        "hinge",
-        "ipo",
-        "exo_pair",
-        "nca_pair",
-        "robust",
-        "bco_pair",
-        "sppo_hard",
-        "aot",
-        "aot_pair",
-        "discopop",
-        "apo_zero",
-        "apo_down",
-    ] = "sigmoid"
-    use_weighting: bool = False
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    is_encoder_decoder: Optional[bool] = None
-    disable_dropout: bool = True
-    generate_during_eval: bool = False
-    precompute_ref_log_probs: bool = False
-    precompute_ref_batch_size: Optional[int] = None
-    dataset_num_proc: Optional[int] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[dict[str, Any]] = None
-    model_adapter_name: Optional[str] = None
-    ref_adapter_name: Optional[str] = None
-    reference_free: bool = False
-    force_use_ref_model: bool = False
-    f_divergence_type: FDivergenceType = FDivergenceType.REVERSE_KL
-    f_alpha_divergence_coef: float = 1.0
-    sync_ref_model: bool = False
-    ref_model_mixup_alpha: float = 0.9
-    ref_model_sync_steps: int = 64
-    rpo_alpha: Optional[float] = None
-    discopop_tau: float = 0.05
-    use_num_logits_to_keep: bool = False
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. "
+            "Higher β means less deviation from the reference model."
+        },
+    )
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "Label smoothing factor."},
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": [
+                "sigmoid",
+                "hinge",
+                "ipo",
+                "exo_pair",
+                "nca_pair",
+                "robust",
+                "bco_pair",
+                "sppo_hard",
+                "aot",
+                "aot_pair",
+                "discopop",
+                "apo_zero",
+                "apo_down",
+            ],
+        },
+    )
+    use_weighting: bool = field(
+        default=False,
+        metadata={"help": "Whether to weight the loss as done in the WPO paper."},
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long. This argument is required if you want to use "
+            "the default data collator.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. 
This argument is required if you want to use the default data " + "collator and your model is an encoder-decoder." + }, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the " + "`model` argument, you need to specify if the model returned by the callable is an encoder-decoder model." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model and reference model."}, + ) + generate_during_eval: bool = field( + default=False, + metadata={ + "help": "If `True`, generates and logs completions from both the model and the reference model " + "to W&B during evaluation." + }, + ) + precompute_ref_log_probs: bool = field( + default=False, + metadata={ + "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. " + "This is useful when training without the reference model to reduce the total GPU memory needed." + }, + ) + precompute_ref_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": "Batch size to use when precomputing reference model log probabilities. This can be set higher " + "than the training batch size to speed up preprocessing. If `None`, defaults to " + "`per_device_train_batch_size` for training and `per_device_eval_batch_size` for evaluation." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "model from a string." + }, + ) + ref_model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the " + "reference model from a string." + }, + ) + model_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."}, + ) + ref_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."}, + ) + reference_free: bool = field( + default=False, + metadata={ + "help": "If `True`, we ignore the _provided_ reference model and implicitly use a reference model that " + "assigns equal probability to all responses." + }, + ) + force_use_ref_model: bool = field( + default=False, + metadata={ + "help": "In case one passes a PEFT model for the active model and you want to use a different model for " + "the ref_model, set this flag to `True`." + }, + ) + f_divergence_type: FDivergenceType = field( + default=FDivergenceType.REVERSE_KL, + metadata={ + "help": "Type of f-divergence regularization function to compute divergence between policy and reference " + "model." + }, + ) + f_alpha_divergence_coef: float = field( + default=1.0, + metadata={"help": "α coefficient in the α-divergence u^-α regularization function for DPO loss."}, + ) + sync_ref_model: bool = field( + default=False, + metadata={ + "help": "When set to `True`, the reference model is synchronized with the active model every " + "`ref_model_sync_steps` steps, using the `ref_model_mixup_alpha` parameter." 
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.9,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`"
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=64,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy."
+        },
+    )
+    rpo_alpha: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "α parameter from the RPO paper (v3), which controls the weighting of the NLL term in the loss. "
+            "If `None`, no weighting is applied and the loss is the same as the DPO loss. The paper recommends "
+            "`rpo_alpha=1.0`."
+        },
+    )
+    discopop_tau: float = field(
+        default=0.05,
+        metadata={
+            "help": "τ/temperature parameter from the DiscoPOP paper, which controls the shape of log ratio modulated "
+            "loss. The paper recommends the default value `discopop_tau=0.05`."
+        },
+    )
+    use_num_logits_to_keep: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, only a specified number of logits are computed in the forward pass of CausalLM. "
+            "This can be useful for saving memory and speeding up training by not computing the logits for all "
+            "tokens, especially in scenarios when working with very long prompts where labels are ignored (-100)."
+        },
+    )
diff --git a/trl/trainer/gkd_config.py b/trl/trainer/gkd_config.py
index e110b047d1..d55bec4b06 100644
--- a/trl/trainer/gkd_config.py
+++ b/trl/trainer/gkd_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Optional
 
 from .sft_config import SFTConfig
@@ -21,7 +21,7 @@
 @dataclass
 class GKDConfig(SFTConfig):
    """
-    Configuration class for GKDTrainer.
+    Configuration class for [`GKDTrainer`].
 
    Args:
        temperature (`float`, *optional*, defaults to `0.9`):
@@ -34,10 +34,10 @@ class GKDConfig(SFTConfig):
            beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
        max_new_tokens (`int`, *optional*, defaults to `128`):
            Maximum number of tokens to generate per completion.
-        teacher_model_name_or_path (`Optional[str]`, *optional*, defaults to `None`):
+        teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
            Model name or path of the teacher model. If `None`, the teacher model will be the same as the model
            being trained.
-        teacher_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        teacher_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
            from a string.
        disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -47,14 +47,54 @@ class GKDConfig(SFTConfig):
            on teacher-generated output).
    """

-    temperature: float = 0.9
-    lmbda: float = 0.5
-    beta: float = 0.5
-    max_new_tokens: int = 128
-    teacher_model_name_or_path: Optional[str] = None
-    teacher_model_init_kwargs: Optional[dict[str, Any]] = None
-    disable_dropout: bool = True
-    seq_kd: bool = False
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    lmbda: float = field(
+        default=0.5,
+        metadata={
+            "help": "Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy "
+            "student-generated outputs)."
+        },
+    )
+    beta: float = field(
+        default=0.5,
+        metadata={
+            "help": "Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence "
+            "loss. When beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL "
+            "Divergence."
+        },
+    )
+    max_new_tokens: int = field(
+        default=128,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    teacher_model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Model name or path of the teacher model. If `None`, the teacher model will be the same as the "
+            "model being trained."
+        },
+    )
+    teacher_model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "teacher model from a string."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in `model`."},
+    )
+    seq_kd: bool = field(
+        default=False,
+        metadata={
+            "help": "Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised "
+            "FT on teacher-generated output)."
+        },
+    )
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/judges.py b/trl/trainer/judges.py
index c29491340d..6b28b51367 100644
--- a/trl/trainer/judges.py
+++ b/trl/trainer/judges.py
@@ -161,7 +161,7 @@ def judge(
    This base class should be used to implement binary evaluations as done in section 4.1.4 of the
    [CGPO paper](https://huggingface.co/papers/2409.20370).
-    It is relevant for assessing whether or not a prompt completion pair satisfies a specific contraint.
+    It is relevant for assessing whether a prompt completion pair satisfies a specific constraint.
 
    Args:
        prompts (`list[str]`): List of prompts.
diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py
index 82926a6aec..5f54ef51d7 100644
--- a/trl/trainer/kto_config.py
+++ b/trl/trainer/kto_config.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
-from typing import Any, Literal, Optional
+from dataclasses import dataclass, field
+from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -31,12 +31,12 @@ class KTOConfig(TrainingArguments):
        learning_rate (`float`, *optional*, defaults to `5e-7`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
            to use the default data collator.
-        max_prompt_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_prompt_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_completion_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the completion. This argument is required if you want to use the default data
            collator and your model is an encoder-decoder.
        beta (`float`, *optional*, defaults to `0.1`):
@@ -54,7 +54,7 @@ class KTOConfig(TrainingArguments):
            Undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs.
        label_pad_token_id (`int`, *optional*, defaults to `-100`):
            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`Optional[int]`, *optional*, defaults to `None`):
+        padding_value (`int` or `None`, *optional*, defaults to `None`):
            Padding value to use. If `None`, the padding value of the tokenizer is used.
        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
@@ -62,39 +62,134 @@ class KTOConfig(TrainingArguments):
        generate_during_eval (`bool`, *optional*, defaults to `False`):
            If `True`, generates and logs completions from both the model and the reference model to W&B or Comet
            during evaluation.
-        is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+        is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
            you need to specify if the model returned by the callable is an encoder-decoder model.
        precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
            Whether to precompute reference model log probabilities for training and evaluation datasets. This is
            useful when training without the reference model to reduce the total GPU memory needed.
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        ref_model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
            from a string.
-        dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model and reference model.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    loss_type: Literal["kto", "apo_zero_unpaired"] = "kto"
-    desirable_weight: float = 1.0
-    undesirable_weight: float = 1.0
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    disable_dropout: bool = True
-    precompute_ref_log_probs: bool = False
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    ref_model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. This argument is required if you want to use the default data "
+            "collator and your model is an encoder-decoder."
+        },
+    )
+    beta: float = field(
+        default=0.1,
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model."
+        },
+    )
+    loss_type: str = field(
+        default="kto",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["kto", "apo_zero_unpaired"],
+        },
+    )
+    desirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Desirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    undesirable_weight: float = field(
+        default=1.0,
+        metadata={
+            "help": "Undesirable losses are weighed by this factor to counter unequal number of desirable and "
+            "undesirable pairs.",
+        },
+    )
+    label_pad_token_id: int = field(
+        default=-100,
+        metadata={
+            "help": "Label pad token id. This argument is required if you want to use the default data collator."
+        },
+    )
+    padding_value: Optional[int] = field(
+        default=None,
+        metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."},
+    )
+    truncation_mode: str = field(
+        default="keep_end",
+        metadata={
+            "help": "Truncation mode to use when the prompt is too long.",
+            "choices": ["keep_end", "keep_start"],
+        },
+    )
+    generate_during_eval: bool = field(
+        default=False,
+        metadata={
+            "help": "If `True`, generates and logs completions from both the model and the reference model to W&B "
+            "during evaluation."
+        },
+    )
+    is_encoder_decoder: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` "
+            "argument, you need to specify if the model returned by the callable is an encoder-decoder model."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model and reference model."},
+    )
+    precompute_ref_log_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to precompute reference model log probabilities for training and evaluation datasets. "
+            "This is useful when training without the reference model to reduce the total GPU memory needed."
+        },
+    )
+    model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model "
+            "from a string."
+        },
+    )
+    ref_model_init_kwargs: Optional[dict[str, Any]] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
+            "reference model from a string."
+ }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py index 3261ff8a6a..ec9119f36b 100644 --- a/trl/trainer/model_config.py +++ b/trl/trainer/model_config.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass -from typing import Literal, Optional +from dataclasses import dataclass, field +from typing import Optional @dataclass @@ -26,11 +26,11 @@ class ModelConfig: command line. Parameters: - model_name_or_path (`Optional[str]`, *optional*, defaults to `None`): + model_name_or_path (`str` or `None`, *optional*, defaults to `None`): Model checkpoint for weights initialization. model_revision (`str`, *optional*, defaults to `"main"`): Specific model version to use. It can be a branch name, a tag name, or a commit id. - torch_dtype (`Optional[Literal["auto", "bfloat16", "float16", "float32"]]`, *optional*, defaults to `None`): + torch_dtype (`Literal["auto", "bfloat16", "float16", "float32"]` or `None`, *optional*, defaults to `None`): Override the default `torch.dtype` and load the model under this dtype. Possible values are - `"bfloat16"`: `torch.bfloat16` @@ -42,7 +42,7 @@ class ModelConfig: Whether to allow for custom models defined on the Hub in their own modeling files. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - attn_implementation (`Optional[str]`, *optional*, defaults to `None`): + attn_implementation (`str` or `None`, *optional*, defaults to `None`): Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`. use_peft (`bool`, *optional*, defaults to `False`): @@ -53,9 +53,9 @@ class ModelConfig: LoRA alpha. lora_dropout (`float`, *optional*, defaults to `0.05`): LoRA dropout. - lora_target_modules (`Optional[Union[str, list[str]]]`, *optional*, defaults to `None`): + lora_target_modules (`Union[str, list[str]]` or `None`, *optional*, defaults to `None`): LoRA target modules. - lora_modules_to_save (`Optional[list[str]]`, *optional*, defaults to `None`): + lora_modules_to_save (`list[str]` or `None`, *optional*, defaults to `None`): Model layers to unfreeze & train. lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`): Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling). @@ -72,27 +72,91 @@ class ModelConfig: Whether to use nested quantization. 
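For context, the four quantization flags above are typically folded into a single `transformers.BitsAndBytesConfig`. The sketch below is a simplified illustration in the spirit of `trl.trainer.utils.get_quantization_config` (the real helper also wires `torch_dtype` into the 4-bit compute dtype); treat it as an assumption-laden example, not the canonical implementation:

from typing import Optional

from transformers import BitsAndBytesConfig


def make_quantization_config(
    load_in_4bit: bool,
    load_in_8bit: bool,
    bnb_4bit_quant_type: str,
    use_bnb_nested_quant: bool,
) -> Optional[BitsAndBytesConfig]:
    # Hypothetical helper: maps ModelConfig-style flags onto bitsandbytes options.
    if load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=bnb_4bit_quant_type,  # "fp4" or "nf4"
            bnb_4bit_use_double_quant=use_bnb_nested_quant,  # nested quantization
        )
    if load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)
    return None  # no quantization: load the base model in full precision


print(make_quantization_config(True, False, "nf4", True))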
""" - model_name_or_path: Optional[str] = None - model_revision: str = "main" - torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None - trust_remote_code: bool = False - attn_implementation: Optional[str] = None - use_peft: bool = False - lora_r: int = 16 - lora_alpha: int = 32 - lora_dropout: float = 0.05 - lora_target_modules: Optional[list[str]] = None - lora_modules_to_save: Optional[list[str]] = None - lora_task_type: str = "CAUSAL_LM" - use_rslora: bool = False - load_in_8bit: bool = False - load_in_4bit: bool = False - bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4" - use_bnb_nested_quant: bool = False + model_name_or_path: Optional[str] = field( + default=None, + metadata={"help": "Model checkpoint for weights initialization."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "Specific model version to use. It can be a branch name, a tag name, or a commit id."}, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": "Override the default `torch.dtype` and load the model under this dtype.", + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": "Whether to allow for custom models defined on the Hub in their own modeling files. This option " + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + }, + ) + attn_implementation: Optional[str] = field( + default=None, + metadata={ + "help": "Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in " + "which case you must install this manually by running `pip install flash-attn --no-build-isolation`." + }, + ) + use_peft: bool = field( + default=False, + metadata={"help": "Whether to use PEFT for training."}, + ) + lora_r: int = field( + default=16, + metadata={"help": "LoRA R value."}, + ) + lora_alpha: int = field( + default=32, + metadata={"help": "LoRA alpha."}, + ) + lora_dropout: float = field( + default=0.05, + metadata={"help": "LoRA dropout."}, + ) + lora_target_modules: Optional[list[str]] = field( + default=None, + metadata={"help": "LoRA target modules."}, + ) + lora_modules_to_save: Optional[list[str]] = field( + default=None, + metadata={"help": "Model layers to unfreeze & train."}, + ) + lora_task_type: str = field( + default="CAUSAL_LM", + metadata={"help": "Task type to pass for LoRA (use 'SEQ_CLS' for reward modeling)."}, + ) + use_rslora: bool = field( + default=False, + metadata={ + "help": "Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, " + "instead of the original default value of `lora_alpha/r`." + }, + ) + load_in_8bit: bool = field( + default=False, + metadata={"help": "Whether to use 8 bit precision for the base model. Works only with LoRA."}, + ) + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to use 4 bit precision for the base model. 
Works only with LoRA."}, + ) + bnb_4bit_quant_type: str = field( + default="nf4", + metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]}, + ) + use_bnb_nested_quant: bool = field( + default=False, + metadata={"help": "Whether to use nested quantization."}, + ) def __post_init__(self): if self.load_in_8bit and self.load_in_4bit: raise ValueError("You can't use 8 bit and 4 bit precision at the same time") - if isinstance(self.lora_target_modules, list) and len(self.lora_target_modules) == 1: + if hasattr(self.lora_target_modules, "__len__") and len(self.lora_target_modules) == 1: self.lora_target_modules = self.lora_target_modules[0] diff --git a/trl/trainer/nash_md_config.py b/trl/trainer/nash_md_config.py index dadad01f03..c8395fd136 100644 --- a/trl/trainer/nash_md_config.py +++ b/trl/trainer/nash_md_config.py @@ -31,7 +31,14 @@ class NashMDConfig(OnlineDPOConfig): epochs. """ - mixture_coef: list[float] = field(default_factory=lambda: [0.5]) + mixture_coef: list[float] = field( + default_factory=lambda: [0.5], + metadata={ + "help": "Logit mixture coefficient for the model and reference model. If a list of floats is provided " + "then the mixture coefficient is selected for each new epoch and the last coefficient is used for the " + "rest of the epochs." + }, + ) def __post_init__(self): super().__post_init__() diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index 5e75ede883..08a5da542b 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -13,7 +13,7 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Optional from transformers import TrainingArguments @@ -31,15 +31,15 @@ class OnlineDPOConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `5e-7`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - reward_model_path (`Optional[str]`, *optional*, defaults to `None`): + reward_model_path (`str` or `None`, *optional*, defaults to `None`): Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both. - judge (`Optional[str]`, *optional*, defaults to `None`): + judge (`str` or `None`, *optional*, defaults to `None`): Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both. max_new_tokens (`int`, *optional*, defaults to `64`): Maximum number of tokens to generate per completion. temperature (`float`, *optional*, defaults to `0.9`): Temperature for sampling. The higher the temperature, the more random the completions. - missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): + missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`): Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive value. @@ -54,22 +54,71 @@ class OnlineDPOConfig(TrainingArguments): - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. 
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model and reference model.
    """

-    learning_rate: float = 5e-7
-    reward_model_path: Optional[str] = None
-    judge: Optional[str] = None
-    max_new_tokens: int = 64
-    temperature: float = 0.9
-    missing_eos_penalty: Optional[float] = None
-    beta: list[float] = field(default_factory=lambda: [0.1])
-    loss_type: Literal["sigmoid", "ipo"] = "sigmoid"
-    dataset_num_proc: Optional[int] = None
-    disable_dropout: bool = True
+    learning_rate: float = field(
+        default=5e-7,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    reward_model_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    judge: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both."
+        },
+    )
+    max_new_tokens: int = field(
+        default=64,
+        metadata={"help": "Maximum number of tokens to generate per completion."},
+    )
+    temperature: float = field(
+        default=0.9,
+        metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
+    )
+    missing_eos_penalty: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to "
+            "encourage the model to generate completions shorter than the maximum length (`max_new_tokens`). The "
+            "penalty must be a positive value."
+        },
+    )
+    beta: list[float] = field(
+        default_factory=lambda: [0.1],
+        metadata={
+            "help": "Parameter controlling the deviation from the reference model. Higher β means less deviation from "
+            "the reference model. For the IPO loss (`loss_type='ipo'`), β is the regularization parameter denoted by "
+            "τ in the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided, β is "
+            "selected for each new epoch and the last β is used for the rest of the epochs."
+        },
+    )
+    loss_type: str = field(
+        default="sigmoid",
+        metadata={
+            "help": "Type of loss to use.",
+            "choices": ["sigmoid", "ipo"],
+        },
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model and reference model."},
+    )
 
     def __post_init__(self):
         super().__post_init__()
diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py
index cd892f1d46..f9a55ff47a 100644
--- a/trl/trainer/orpo_config.py
+++ b/trl/trainer/orpo_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Optional
 
 from transformers import TrainingArguments
 
@@ -31,12 +31,12 @@ class ORPOConfig(TrainingArguments):
        learning_rate (`float`, *optional*, defaults to `1e-6`):
            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
            [`~transformers.TrainingArguments`].
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you
            want to use the default data collator.
-        max_prompt_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_prompt_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the prompt. This argument is required if you want to use the default data collator.
-        max_completion_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_completion_length (`int` or `None`, *optional*, defaults to `None`):
            Maximum length of the completion. This argument is required if you want to use the default data collator
            and your model is an encoder-decoder.
        beta (`float`, *optional*, defaults to `0.1`):
@@ -46,33 +46,95 @@ class ORPOConfig(TrainingArguments):
            Whether to disable dropout in the model.
        label_pad_token_id (`int`, *optional*, defaults to `-100`):
            Label pad token id. This argument is required if you want to use the default data collator.
-        padding_value (`Optional[int]`, *optional*, defaults to `None`):
+        padding_value (`int` or `None`, *optional*, defaults to `None`):
            Padding value to use. If `None`, the padding value of the tokenizer is used.
        truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
            Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
            This argument is required if you want to use the default data collator.
        generate_during_eval (`bool`, *optional*, defaults to `False`):
            If `True`, generates and logs completions from the model to W&B or Comet during evaluation.
-        is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+        is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
            When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
            you need to specify if the model returned by the callable is an encoder-decoder model.
-        model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`):
+        model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
            Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
            string.
-        dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
    """

-    learning_rate: float = 1e-6
-    max_length: Optional[int] = None
-    max_prompt_length: Optional[int] = None
-    max_completion_length: Optional[int] = None
-    beta: float = 0.1
-    disable_dropout: bool = True
-    label_pad_token_id: int = -100
-    padding_value: Optional[int] = None
-    truncation_mode: str = "keep_end"
-    generate_during_eval: bool = False
-    is_encoder_decoder: Optional[bool] = None
-    model_init_kwargs: Optional[dict[str, Any]] = None
-    dataset_num_proc: Optional[int] = None
+    learning_rate: float = field(
+        default=1e-6,
+        metadata={
+            "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
+            "`transformers.TrainingArguments`."
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) in the batch."},
+    )
+    max_prompt_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the prompt. This argument is required if you want to use the default data "
+            "collator."
+        },
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion. 
This argument is required if you want to use the default data " + "collator and your model is an encoder-decoder." + }, + ) + beta: float = field( + default=0.1, + metadata={ + "help": "Parameter controlling the relative ratio loss weight in the ORPO loss. In the paper, it is " + "denoted by λ." + }, + ) + disable_dropout: bool = field( + default=True, + metadata={"help": "Whether to disable dropout in the model."}, + ) + label_pad_token_id: int = field( + default=-100, + metadata={ + "help": "Label pad token id. This argument is required if you want to use the default data collator." + }, + ) + padding_value: Optional[int] = field( + default=None, + metadata={"help": "Padding value to use. If `None`, the padding value of the tokenizer is used."}, + ) + truncation_mode: str = field( + default="keep_end", + metadata={ + "help": "Truncation mode to use when the prompt is too long.", + "choices": ["keep_end", "keep_start"], + }, + ) + generate_during_eval: bool = field( + default=False, + metadata={"help": "If `True`, generates and logs completions from the model to W&B during evaluation."}, + ) + is_encoder_decoder: Optional[bool] = field( + default=None, + metadata={ + "help": "When using the `model_init` argument (callable) to instantiate the model instead of the `model` " + "argument, you need to specify if the model returned by the callable is an encoder-decoder model." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model " + "from a string." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py index 62a3b0a33e..c98f8aedce 100644 --- a/trl/trainer/ppo_config.py +++ b/trl/trainer/ppo_config.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from ..trainer.utils import OnPolicyConfig @@ -33,9 +33,9 @@ class PPOConfig(OnPolicyConfig): Name of this experiment. reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): Path to the reward model. - model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. - ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. num_ppo_epochs (`int`, *optional*, defaults to `4`): Number of epochs to train. @@ -55,15 +55,51 @@ class PPOConfig(OnPolicyConfig): Lambda value for GAE. 
""" - exp_name: str = os.path.basename(__file__)[: -len(".py")] - reward_model_path: str = "EleutherAI/pythia-160m" - model_adapter_name: Optional[str] = None - ref_adapter_name: Optional[str] = None - num_ppo_epochs: int = 4 - whiten_rewards: bool = False - kl_coef: float = 0.05 - cliprange: float = 0.2 - vf_coef: float = 0.1 - cliprange_value: float = 0.2 - gamma: float = 1.0 - lam: float = 0.95 + exp_name: str = field( + default=os.path.basename(__file__)[:-3], + metadata={"help": "Name of this experiment."}, + ) + reward_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the reward model."}, + ) + model_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the train target PEFT adapter, when using LoRA with multiple adapters."}, + ) + ref_adapter_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the reference PEFT adapter, when using LoRA with multiple adapters."}, + ) + num_ppo_epochs: int = field( + default=4, + metadata={"help": "Number of epochs to train."}, + ) + whiten_rewards: bool = field( + default=False, + metadata={"help": "Whether to whiten the rewards."}, + ) + kl_coef: float = field( + default=0.05, + metadata={"help": "KL coefficient."}, + ) + cliprange: float = field( + default=0.2, + metadata={"help": "Clip range."}, + ) + vf_coef: float = field( + default=0.1, + metadata={"help": "Value function coefficient."}, + ) + cliprange_value: float = field( + default=0.2, + metadata={"help": "Clip range for the value function."}, + ) + gamma: float = field( + default=1.0, + metadata={"help": "Discount factor."}, + ) + lam: float = field( + default=0.95, + metadata={"help": "Lambda value for GAE."}, + ) diff --git a/trl/trainer/prm_config.py b/trl/trainer/prm_config.py index 21a4fc5662..4a1046e2de 100644 --- a/trl/trainer/prm_config.py +++ b/trl/trainer/prm_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from transformers import TrainingArguments @@ -31,9 +31,9 @@ class PRMConfig(TrainingArguments): learning_rate (`float`, *optional*, defaults to `1e-5`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_length (`Optional[int]`, *optional*, defaults to `None`): + max_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the sequences (prompt + completion) used for truncation. - max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + max_completion_length (`int` or `None`, *optional*, defaults to `None`): Maximum length of the completion used for truncation. The completion is the concatenation of the steps. disable_dropout (`bool`, *optional*, defaults to `True`): Whether to disable dropout in the model. @@ -45,10 +45,37 @@ class PRMConfig(TrainingArguments): Number of processes to use for processing the dataset. """ - learning_rate: float = 1e-5 - max_length: Optional[int] = None - max_completion_length: Optional[int] = None - disable_dropout: bool = True - step_separator: str = "\n" - train_on_last_step_only: bool = False - dataset_num_proc: Optional[int] = None + learning_rate: float = field( + default=1e-5, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`TrainingArguments`." 
+        },
+    )
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={"help": "Maximum length of the sequences (prompt + completion) used for truncation."},
+    )
+    max_completion_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the completion used for truncation. The completion is the concatenation of the "
+            "steps."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    step_separator: str = field(
+        default="\n",
+        metadata={"help": "Separator used to separate each step of the reasoning process."},
+    )
+    train_on_last_step_only: bool = field(
+        default=False,
+        metadata={"help": "Whether to train only on the last step."},
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
diff --git a/trl/trainer/reward_config.py b/trl/trainer/reward_config.py
index 8018a2844c..e19cd1ca7c 100644
--- a/trl/trainer/reward_config.py
+++ b/trl/trainer/reward_config.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional
 
 from transformers import TrainingArguments
@@ -28,7 +28,7 @@ class RewardConfig(TrainingArguments):
     command line.
 
     Parameters:
-        max_length (`Optional[int]`, *optional*, defaults to `None`):
+        max_length (`int` or `None`, *optional*, defaults to `None`):
             Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want
             to use the default data collator.
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -39,12 +39,36 @@
             Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
             https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.
         remove_unused_columns (`bool`, *optional*, defaults to `False`):
-            Whether or not to remove the columns that are not used by the model's forward pass. Can be `True` only if
+            Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if
             the dataset is pretokenized.
     """
 
-    max_length: Optional[int] = None
-    disable_dropout: bool = True
-    dataset_num_proc: Optional[int] = None
-    center_rewards_coefficient: Optional[float] = None
-    remove_unused_columns: bool = False
+    max_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Maximum length of the sequences (prompt + completion) in the batch. This argument is required if "
+            "you want to use the default data collator."
+        },
+    )
+    disable_dropout: bool = field(
+        default=True,
+        metadata={"help": "Whether to disable dropout in the model."},
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    center_rewards_coefficient: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by "
+            "https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
+        },
+    )
+    remove_unused_columns: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to remove the columns that are not used by the model's forward pass. Can be `True` only "
+            "if the dataset is pretokenized."
+ }, + ) diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py index e72b0fc02c..b0aa66bf5e 100644 --- a/trl/trainer/rloo_config.py +++ b/trl/trainer/rloo_config.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from dataclasses import dataclass +from dataclasses import dataclass, field from ..trainer.utils import OnPolicyConfig @@ -44,10 +44,31 @@ class RLOOConfig(OnPolicyConfig): REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. """ - exp_name: str = os.path.basename(__file__)[: -len(".py")] - reward_model_path: str = "EleutherAI/pythia-160m" - num_ppo_epochs: int = 4 - whiten_rewards: bool = False - kl_coef: float = 0.05 - cliprange: float = 0.2 - rloo_k: int = 2 + exp_name: str = field( + default=os.path.basename(__file__)[:-3], + metadata={"help": "Name of this experiment."}, + ) + reward_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the reward model."}, + ) + num_ppo_epochs: int = field( + default=4, + metadata={"help": "Number of epochs to train."}, + ) + whiten_rewards: bool = field( + default=False, + metadata={"help": "Whether to whiten the rewards."}, + ) + kl_coef: float = field( + default=0.05, + metadata={"help": "KL coefficient."}, + ) + cliprange: float = field( + default=0.2, + metadata={"help": "Clip range."}, + ) + rloo_k: int = field( + default=2, + metadata={"help": "REINFORCE Leave-One-Out (RLOO) number of online samples per prompt."}, + ) diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py index 310ede7870..dbc05c6ad5 100644 --- a/trl/trainer/sft_config.py +++ b/trl/trainer/sft_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Optional from transformers import TrainingArguments @@ -35,20 +35,20 @@ class SFTConfig(TrainingArguments): Controls whether the [`ConstantLengthDataset`] packs the sequences of the dataset. learning_rate (`float`, *optional*, defaults to `2e-5`): Initial learning rate for [`AdamW`] optimizer. The default value replaces that of [`~transformers.TrainingArguments`]. - max_seq_length (`Optional[int]`, *optional*, defaults to `None`): + max_seq_length (`int` or `None`, *optional*, defaults to `None`): Maximum sequence length for the [`ConstantLengthDataset`] and for automatically creating the dataset. If `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`. - dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. Only used when `packing=False`. dataset_batch_size (`Union[int, None]`, *optional*, defaults to `1000`): Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is None`, tokenizes the full dataset as a single batch. - model_init_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a string. - dataset_kwargs (`Optional[dict[str, Any]]`, *optional*, defaults to `None`): + dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets. 
- eval_packing (`Optional[bool]`, *optional*, defaults to `None`): + eval_packing (`bool` or `None`, *optional*, defaults to `None`): Whether to pack the eval dataset. If `None`, uses the same value as `packing`. num_of_sequences (`int`, *optional*, defaults to `1024`): Number of sequences to use for the [`ConstantLengthDataset`]. @@ -59,15 +59,67 @@ class SFTConfig(TrainingArguments): Monkey patch the model with Liger kernels to increase throughput and reduce memory usage. """ - dataset_text_field: str = "text" - packing: bool = False - learning_rate: float = 2.0e-5 - max_seq_length: Optional[int] = None - dataset_num_proc: Optional[int] = None - dataset_batch_size: int = 1000 - model_init_kwargs: Optional[dict[str, Any]] = None - dataset_kwargs: Optional[dict[str, Any]] = None - eval_packing: Optional[bool] = None - num_of_sequences: int = 1024 - chars_per_token: float = 3.6 - use_liger: bool = False + dataset_text_field: str = field( + default="text", + metadata={ + "help": "Name of the text field of the dataset. If provided, the trainer will automatically create a " + "`ConstantLengthDataset` based on `dataset_text_field`." + }, + ) + packing: bool = field( + default=False, + metadata={"help": "Controls whether the `ConstantLengthDataset` packs the sequences of the dataset."}, + ) + learning_rate: float = field( + default=2.0e-5, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`TrainingArguments`." + }, + ) + max_seq_length: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum sequence length for the `ConstantLengthDataset` and for automatically creating the " + "dataset. If `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`." + }, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset. Only used when `packing=False`."}, + ) + dataset_batch_size: int = field( + default=1000, + metadata={ + "help": "Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is " + "None`, tokenizes the full dataset as a single batch." + }, + ) + model_init_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model " + "from a string." + }, + ) + dataset_kwargs: Optional[dict[str, Any]] = field( + default=None, + metadata={ + "help": "Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets." + }, + ) + eval_packing: Optional[bool] = field( + default=None, + metadata={"help": "Whether to pack the eval dataset. 
If `None`, uses the same value as `packing`."},
+    )
+    num_of_sequences: int = field(
+        default=1024,
+        metadata={"help": "Number of sequences to use for the `ConstantLengthDataset`."},
+    )
+    chars_per_token: float = field(
+        default=3.6, metadata={"help": "Number of characters per token to use for the `ConstantLengthDataset`."}
+    )
+    use_liger: bool = field(
+        default=False,
+        metadata={"help": "Monkey patch the model with Liger kernels to increase throughput and reduce memory usage."},
+    )
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
index 5577c57481..f4d8510b9b 100644
--- a/trl/trainer/utils.py
+++ b/trl/trainer/utils.py
@@ -18,7 +18,7 @@
 import random
 import warnings
 from collections import deque
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from importlib.metadata import version
 from typing import Any, Literal, Optional, Union
 
@@ -73,7 +73,7 @@ class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
         differently if it does not have proper context.
     instruction_template (`Union[str, list[int]]`): the template form that indicates the start of the human instruction, typically something like
         '### Human:\n'. Useful for assistant-style conversation datasets. It can also be passed as tokenized ids.
-    mlm (`bool`, *optional*, defaults to `False`): Whether or not to use masked language modeling in the underlying
+    mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language modeling in the underlying
         `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present for
         flexibility and backwards-compatibility.
     ignore_index (`int`, *optional*, defaults to `-100`):
@@ -336,7 +336,7 @@ class RewardDataCollatorWithPadding:
             The tokenizer used for encoding the data.
         padding (`Union[bool, str, `PaddingStrategy`]`, `optional`, defaults to `True`):
             padding_strategy to pass to the tokenizer.
-        pad_to_multiple_of (`Optional[int]`, `optional`, defaults to `None`):
+        pad_to_multiple_of (`int` or `None`, `optional`, defaults to `None`):
             If set will pad the sequence to a multiple of the provided value.
         return_tensors (`str`, `optional`, defaults to `"pt"`):
             The tensor type to use.
@@ -463,8 +463,8 @@ class DPODataCollatorWithPadding:
             The tokenizer's pad_token_id.
         label_pad_token_id (`int`, defaults to -100):
             The label used for masking.
-        is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`):
-            Whether or not you model has an encoder_decoder architecture.
+        is_encoder_decoder (`bool` or `None`, `optional`, defaults to `None`):
+            Whether your model has an encoder_decoder architecture.
     """
 
     pad_token_id: int = 0
@@ -548,7 +548,7 @@ class ConstantLengthDataset(IterableDataset):
             The processor used for processing the data.
         dataset (`dataset.Dataset`):
             Dataset with text files.
-        dataset_text_field (`Optional[str]`, *optional*, defaults to `None`):
+        dataset_text_field (`str` or `None`, *optional*, defaults to `None`):
             Name of the field in the dataset that contains the text. Only one of `dataset_text_field` and
             `formatting_func` should be provided.
         formatting_func (`Callable`, *optional*):
@@ -978,13 +978,13 @@ class OnPolicyConfig(TrainingArguments):
     command line.
 
     Parameters:
-        run_name (`Optional[str]`, *optional*, defaults to `None`):
+        run_name (`str` or `None`, *optional*, defaults to `None`):
             Name of the run.
- dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. num_mini_batches (`int`, *optional*, defaults to `1`): Number of minibatches to split a batch into. - total_episodes (`Optional[int]`, *optional*, defaults to `None`): + total_episodes (`int` or `None`, *optional*, defaults to `None`): Total number of episodes in the dataset. local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`): Per rank no grad forward pass in the rollout phase. @@ -992,56 +992,125 @@ class OnPolicyConfig(TrainingArguments): Number of debugging samples generations (i.e., `generate_completions` calls) throughout training. response_length (`int`, *optional*, defaults to `53`): Length of the response. - stop_token (`Optional[str]`, *optional*, defaults to `None`): + stop_token (`str` or `None`, *optional*, defaults to `None`): Stop token. - stop_token_id (`Optional[int]`, *optional*, defaults to `None`): + stop_token_id (`int` or `None`, *optional*, defaults to `None`): Truncation token id. temperature (`float`, *optional*, defaults to `0.7`): Sampling temperature. - missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): + missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`): Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive value. sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): Path to the SFT model. - world_size (`Optional[int]`, *optional*, defaults to `None`): + world_size (`int` or `None`, *optional*, defaults to `None`): Number of processes (GPUs) to use for the training. - num_total_batches (`Optional[int]`, *optional*, defaults to `None`): + num_total_batches (`int` or `None`, *optional*, defaults to `None`): Number of total batches to train. - micro_batch_size (`Optional[int]`, *optional*, defaults to `None`): + micro_batch_size (`int` or `None`, *optional*, defaults to `None`): Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`). - local_batch_size (`Optional[int]`, *optional*, defaults to `None`): + local_batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`). - batch_size (`Optional[int]`, *optional*, defaults to `None`): + batch_size (`int` or `None`, *optional*, defaults to `None`): Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`). - local_mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + local_mini_batch_size (`int` or `None`, *optional*, defaults to `None`): Mini batch size per GPU. - mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + mini_batch_size (`int` or `None`, *optional*, defaults to `None`): Mini batch size across GPUs. push_to_hub (`bool`, *optional*, defaults to `False`): Whether to push the model to the Hub after training. 
""" - run_name: Optional[str] = None - dataset_num_proc: Optional[int] = None - num_mini_batches: int = 1 - total_episodes: Optional[int] = None - local_rollout_forward_batch_size: int = 64 - num_sample_generations: int = 10 - response_length: int = 53 - stop_token: Optional[Literal["eos"]] = None - stop_token_id: Optional[int] = None - temperature: float = 0.7 - missing_eos_penalty: Optional[float] = None - sft_model_path: str = "EleutherAI/pythia-160m" - world_size: Optional[int] = None - num_total_batches: Optional[int] = None - micro_batch_size: Optional[int] = None - local_batch_size: Optional[int] = None - batch_size: Optional[int] = None - local_mini_batch_size: Optional[int] = None - mini_batch_size: Optional[int] = None - push_to_hub: bool = False + run_name: Optional[str] = field( + default=None, + metadata={"help": "Name of the run."}, + ) + dataset_num_proc: Optional[int] = field( + default=None, + metadata={"help": "Number of processes to use for processing the dataset."}, + ) + num_mini_batches: int = field( + default=1, + metadata={"help": "Number of minibatches to split a batch into."}, + ) + total_episodes: Optional[int] = field( + default=None, + metadata={"help": "Total number of episodes in the dataset."}, + ) + local_rollout_forward_batch_size: int = field( + default=64, + metadata={"help": "Per rank no grad forward pass in the rollout phase."}, + ) + num_sample_generations: int = field( + default=10, + metadata={ + "help": "Number of debugging samples generations (i.e., `generate_completions` calls) throughout training." + }, + ) + response_length: int = field( + default=53, + metadata={"help": "Length of the response."}, + ) + stop_token: Optional[Literal["eos"]] = field( + default=None, + metadata={"help": "Stop token."}, + ) + stop_token_id: Optional[int] = field( + default=None, + metadata={"help": "Truncation token id."}, + ) + temperature: float = field( + default=0.7, + metadata={"help": "Sampling temperature."}, + ) + missing_eos_penalty: Optional[float] = field( + default=None, + metadata={ + "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to " + "encourage to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be " + "a positive value." + }, + ) + sft_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the SFT model."}, + ) + world_size: Optional[int] = field( + default=None, + metadata={"help": "Number of processes (GPUs) to use for the training."}, + ) + num_total_batches: Optional[int] = field( + default=None, + metadata={"help": "Number of total batches to train."}, + ) + micro_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)."}, + ) + local_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)."}, + ) + batch_size: Optional[int] = field( + default=None, + metadata={ + "help": "Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * " + "`gradient_accumulation_steps`)." 
+ }, + ) + local_mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size per GPU."}, + ) + mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size across GPUs."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the model to the Hub after training."}, + ) def first_true_indices(bools: torch.Tensor, dtype=torch.long): diff --git a/trl/trainer/xpo_config.py b/trl/trainer/xpo_config.py index ffeacbb961..8ae925994b 100644 --- a/trl/trainer/xpo_config.py +++ b/trl/trainer/xpo_config.py @@ -26,10 +26,17 @@ class XPOConfig(OnlineDPOConfig): Parameters: alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): - Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch and the last alpha is used for the rest of the epochs. + Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch + and the last alpha is used for the rest of the epochs. """ - alpha: list[float] = field(default_factory=lambda: [1e-5]) + alpha: list[float] = field( + default_factory=lambda: [1e-5], + metadata={ + "help": "Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each " + "new epoch and the last alpha is used for the rest of the epochs." + }, + ) def __post_init__(self): super().__post_init__()
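
The sketches below illustrate the behaviors that several of the new help strings describe. None of this code is part of the patch; any name not present in the diff is illustrative only.

First, why the help text moves into `field(metadata=...)` at all: `HfArgumentParser` copies a dataclass field's metadata into the keyword arguments of `argparse`'s `add_argument`, so `"help"` becomes the `--help` description and `"choices"` constrains the accepted values. This is also why `Literal[...]` annotations above become plain types plus a `"choices"` entry. A minimal sketch, assuming only `transformers` is installed; `DemoConfig` and its fields are made up for illustration:

# demo_field_metadata.py -- illustrative, not part of the patch
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoConfig:
    loss_type: str = field(
        default="sigmoid",
        # "help" feeds --help; "choices" is forwarded to argparse, so
        # `--loss_type hinge` fails with "invalid choice".
        metadata={"help": "Type of loss to use.", "choices": ["sigmoid", "ipo"]},
    )
    dataset_num_proc: Optional[int] = field(
        default=None,
        metadata={"help": "Number of processes to use for processing the dataset."},
    )


parser = HfArgumentParser(DemoConfig)
(config,) = parser.parse_args_into_dataclasses(args=["--loss_type", "ipo"])
print(config)  # DemoConfig(loss_type='ipo', dataset_num_proc=None)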
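
The `beta` and `alpha` help strings describe a per-epoch schedule: one value per epoch, with the last value reused once the list is exhausted. A hedged sketch of that lookup (the trainers' actual selection code is not part of this diff):

# Per-epoch schedule as documented: schedule[epoch], falling back to the last entry.
def value_for_epoch(schedule: list[float], epoch: int) -> float:
    return schedule[epoch] if epoch < len(schedule) else schedule[-1]


beta = [0.1, 0.05, 0.02]
assert value_for_epoch(beta, 0) == 0.1
assert value_for_epoch(beta, 5) == 0.02  # past the end of the list -> last beta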
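
The `missing_eos_penalty` help describes subtracting a positive penalty from the score of any completion that never emits the EOS token, nudging the policy toward finishing before `max_new_tokens`. A sketch under assumed tensor shapes; the function name and signature are illustrative, not the trainers' real code:

import torch


def apply_missing_eos_penalty(
    scores: torch.Tensor,          # (batch,) reward scores
    completion_ids: torch.Tensor,  # (batch, seq_len) generated token ids
    eos_token_id: int,
    missing_eos_penalty: float,    # must be a positive value, per the help string
) -> torch.Tensor:
    # True for rows that contain at least one EOS token.
    contains_eos = (completion_ids == eos_token_id).any(dim=-1)
    scores = scores.clone()
    scores[~contains_eos] -= missing_eos_penalty
    return scores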
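
On the PPO side, `gamma` ("Discount factor.") and `lam` ("Lambda value for GAE.") are the two knobs of generalized advantage estimation. A simplified, single-trajectory sketch of the standard GAE recursion, offered as background rather than the trainer's verbatim implementation:

import torch


def gae(rewards: torch.Tensor, values: torch.Tensor, gamma: float = 1.0, lam: float = 0.95) -> torch.Tensor:
    # Backward pass: A_t = delta_t + gamma * lam * A_{t+1},
    # where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(len(rewards))):
        next_value = values[t + 1] if t + 1 < len(values) else 0.0
        delta = rewards[t] + gamma * next_value - values[t]
        last_gae = delta + gamma * lam * last_gae
        advantages[t] = last_gae
    return advantages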
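
`rloo_k` refers to the REINFORCE leave-one-out baseline: with k online samples per prompt, each sample's advantage is its reward minus the mean reward of the other k - 1 samples. A sketch with assumed shapes, illustrating the idea rather than quoting the trainer:

import torch


def rloo_advantages(rewards: torch.Tensor) -> torch.Tensor:
    """rewards: (rloo_k, num_prompts) -> advantages of the same shape."""
    rloo_k = rewards.size(0)
    # Leave-one-out mean: (sum of all samples - this sample) / (k - 1).
    baseline = (rewards.sum(dim=0, keepdim=True) - rewards) / (rloo_k - 1)
    return rewards - baseline


rewards = torch.tensor([[1.0, 0.0], [3.0, 2.0]])  # rloo_k=2, two prompts
print(rloo_advantages(rewards))  # each sample baselined by the other one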
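
`center_rewards_coefficient` enables an auxiliary term that pushes the reward model toward mean-zero outputs (Eq. 2 of https://huggingface.co/papers/2312.09244). A hedged reconstruction of how such a term combines with the usual pairwise loss; not necessarily the trainer's exact code:

import torch


def centered_reward_loss(
    rewards_chosen: torch.Tensor,
    rewards_rejected: torch.Tensor,
    center_rewards_coefficient: float = 0.01,  # the recommended value in the help string
) -> torch.Tensor:
    # Standard Bradley-Terry pairwise loss ...
    loss = -torch.nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
    # ... plus a penalty on the mean reward magnitude, incentivizing centered outputs.
    loss = loss + center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2)
    return loss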
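
In `SFTConfig`, `chars_per_token` and `num_of_sequences` size the character buffer that `ConstantLengthDataset` fills before tokenizing and packing fixed-length sequences. A back-of-envelope with the defaults; the buffer formula is an assumption inferred from the help strings, not quoted from the dataset class:

max_seq_length = 1024    # tokens per packed example (default cap)
num_of_sequences = 1024  # packed sequences per buffer
chars_per_token = 3.6    # rough characters-per-token ratio for English text

max_buffer_size = max_seq_length * chars_per_token * num_of_sequences
print(f"{max_buffer_size:,.0f} characters buffered per packing round")  # ~3.8M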
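
Finally, the batch-size help strings in `OnPolicyConfig` encode fixed relationships between the derived sizes and the underlying `TrainingArguments`. A worked example under assumed values (2 GPUs, 4 gradient-accumulation steps); the mini-batch split follows the `num_mini_batches` help:

per_device_train_batch_size = 8
gradient_accumulation_steps = 4
world_size = 2
num_mini_batches = 2

micro_batch_size = per_device_train_batch_size * world_size                    # 16
local_batch_size = per_device_train_batch_size * gradient_accumulation_steps   # 32 (per GPU)
batch_size = micro_batch_size * gradient_accumulation_steps                    # 64 (global)
local_mini_batch_size = local_batch_size // num_mini_batches                   # 16
mini_batch_size = batch_size // num_mini_batches                               # 32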