diff --git a/chapters/01-introduction.md b/chapters/01-introduction.md
index 53e3283..7e95c5b 100644
--- a/chapters/01-introduction.md
+++ b/chapters/01-introduction.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Home"
+prev-url: "https://rlhfbook.com/"
+next-chapter: "Key Related Works"
+next-url: "02-related-works.html"
+---
+
 # Introduction
 
 Reinforcement learning from Human Feedback (RLHF) is a technique used to incorporate human information into AI systems.
diff --git a/chapters/04-related-works.md b/chapters/02-related-works.md
similarity index 97%
rename from chapters/04-related-works.md
rename to chapters/02-related-works.md
index 68a9795..6c28957 100644
--- a/chapters/04-related-works.md
+++ b/chapters/02-related-works.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Introduction"
+prev-url: "01-introduction.html"
+next-chapter: "Problem Setup"
+next-url: "03-setup.html"
+---
+
 # Key Related Works
 
 In this chapter we detail the key papers and projects that got the RLHF field to where it is today.
diff --git a/chapters/05-setup.md b/chapters/03-setup.md
similarity index 96%
rename from chapters/05-setup.md
rename to chapters/03-setup.md
index 50f7161..93b7ba8 100644
--- a/chapters/05-setup.md
+++ b/chapters/03-setup.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Key Related Works"
+prev-url: "02-related-works.html"
+next-chapter: "Problem Formulation"
+next-url: "04-optimization.html"
+---
+
 # Definitions
 
 This chapter includes all the definitions, symbols, and operatings frequently used in the RLHF process.
diff --git a/chapters/03-optimization.md b/chapters/04-optimization.md
similarity index 95%
rename from chapters/03-optimization.md
rename to chapters/04-optimization.md
index c49c300..4ec1631 100644
--- a/chapters/03-optimization.md
+++ b/chapters/04-optimization.md
@@ -1,3 +1,9 @@
+---
+prev-chapter: "Problem Setup"
+prev-url: "03-setup.html"
+next-chapter: "The Nature of Preferences"
+next-url: "05-preferences.html"
+---
 
 # Problem Formulation
 
diff --git a/chapters/02-preferences.md b/chapters/05-preferences.md
similarity index 97%
rename from chapters/02-preferences.md
rename to chapters/05-preferences.md
index 7c83b52..52dd329 100644
--- a/chapters/02-preferences.md
+++ b/chapters/05-preferences.md
@@ -1,3 +1,9 @@
+---
+prev-chapter: "Problem Formulation"
+prev-url: "04-optimization.html"
+next-chapter: "Preference Data"
+next-url: "06-preference-data.html"
+---
 
 # The Nature of Preferences
 
diff --git a/chapters/06-preference-data.md b/chapters/06-preference-data.md
index 64a9206..1879910 100644
--- a/chapters/06-preference-data.md
+++ b/chapters/06-preference-data.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "The Nature of Preferences"
+prev-url: "05-preferences.html"
+next-chapter: "Reward Modeling"
+next-url: "07-reward-models.html"
+---
+
 # [Incomplete] Preference Data
 
 ## Collecting Preference Data
diff --git a/chapters/07-reward-models.md b/chapters/07-reward-models.md
index 5792180..ddef25f 100644
--- a/chapters/07-reward-models.md
+++ b/chapters/07-reward-models.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Preference Data"
+prev-url: "06-preference-data.html"
+next-chapter: "Regularization"
+next-url: "08-regularization.html"
+---
+
 # Reward Modeling
 
 Reward models are core to the modern approach to RLHF.
diff --git a/chapters/08-regularization.md b/chapters/08-regularization.md
index 05fbdab..c32067a 100644
--- a/chapters/08-regularization.md
+++ b/chapters/08-regularization.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Reward Modeling"
+prev-url: "07-reward-models.html"
+next-chapter: "Instruction Tuning"
+next-url: "09-instruction-tuning.html"
+---
+
 # Regularization
 
 Throughout the RLHF optimization, many regularization steps are used to prevent over-optimization of the reward model.
diff --git a/chapters/09-instruction-tuning.md b/chapters/09-instruction-tuning.md
index 0695aa8..99b6244 100644
--- a/chapters/09-instruction-tuning.md
+++ b/chapters/09-instruction-tuning.md
@@ -1 +1,8 @@
+---
+prev-chapter: "Regularization"
+prev-url: "08-regularization.html"
+next-chapter: "Rejection Sampling"
+next-url: "10-rejection-sampling.html"
+---
+
 # Instruction Tuning
\ No newline at end of file
diff --git a/chapters/10-rejection-sampling.md b/chapters/10-rejection-sampling.md
index a2315f4..2d8c427 100644
--- a/chapters/10-rejection-sampling.md
+++ b/chapters/10-rejection-sampling.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Instruction Tuning"
+prev-url: "09-instruction-tuning.html"
+next-chapter: "Policy Gradients"
+next-url: "11-policy-gradients.html"
+---
+
 # Rejection Sampling
 
 Rejection Sampling (RS) is a popular and simple baseline for performing preference fine-tuning.
diff --git a/chapters/11-policy-gradients.md b/chapters/11-policy-gradients.md
index ae06643..16c1f5c 100644
--- a/chapters/11-policy-gradients.md
+++ b/chapters/11-policy-gradients.md
@@ -1,3 +1,10 @@
+---
+prev-chapter: "Rejection Sampling"
+prev-url: "10-rejection-sampling.html"
+next-chapter: "Direct Alignment Algorithms"
+next-url: "12-direct-alignment.html"
+---
+
 # [Incomplete] Policy Gradient Algorithms
 
 
@@ -25,6 +32,7 @@ $$\nabla_\theta J(\pi_\theta) = \mathbb{E}_\tau \left[ \sum_{t=0}^T \nabla_\thet
 Reinforce is a specific implementation of vanilla policy gradient that uses a Monte Carlo estimator of the gradient.
 
 [@ahmadian2024back]
+
 ### Proximal Policy Optimization
 
 ## Computing Policy Gradients with a Language Model
diff --git a/chapters/12-direct-alignment.md b/chapters/12-direct-alignment.md
index 35c71ee..5dcb85e 100644
--- a/chapters/12-direct-alignment.md
+++ b/chapters/12-direct-alignment.md
@@ -1 +1,8 @@
+---
+prev-chapter: "Policy Gradients"
+prev-url: "11-policy-gradients.html"
+next-chapter: "Constitutional AI"
+next-url: "13-cai.html"
+---
+
 # [Incomplete] Direct Alignment Algorithms
\ No newline at end of file
diff --git a/chapters/13-cai.md b/chapters/13-cai.md
index 6ec53da..ed622d9 100644
--- a/chapters/13-cai.md
+++ b/chapters/13-cai.md
@@ -1 +1,8 @@
+---
+prev-chapter: "Direct Alignment"
+prev-url: "12-direct-alignment.html"
+next-chapter: "Reasoning Models"
+next-url: "14-reasoning.html"
+---
+
 # [Incomplete] Constitutional AI and AI Feedback
\ No newline at end of file
diff --git a/chapters/14-reasoning.md b/chapters/14-reasoning.md
index d6ca235..0f77a26 100644
--- a/chapters/14-reasoning.md
+++ b/chapters/14-reasoning.md
@@ -1 +1,8 @@
-# [Incomplete] Constitutional AI
\ No newline at end of file
+---
+prev-chapter: ""
+prev-url: ""
+next-chapter: ""
+next-url: ""
+---
+
+# [Incomplete] Reasoning Training and Models
\ No newline at end of file
diff --git a/chapters/15-synthetic.md b/chapters/15-synthetic.md
index 1a60443..30c9bcc 100644
--- a/chapters/15-synthetic.md
+++ b/chapters/15-synthetic.md
@@ -1 +1,8 @@
+---
+prev-chapter: ""
+prev-url: ""
+next-chapter: ""
+next-url: ""
+---
+
 # [Incomplete] Synthetic Data
\ No newline at end of file
diff --git a/chapters/16-evaluation.md b/chapters/16-evaluation.md
index d58b91b..f258bcd 100644
--- a/chapters/16-evaluation.md
+++ b/chapters/16-evaluation.md
@@ -1 +1,8 @@
+---
+prev-chapter: ""
+prev-url: ""
+next-chapter: ""
+next-url: ""
+---
+
 # [Incomplete] Evaluation
\ No newline at end of file
diff --git a/chapters/17-over-optimization.md b/chapters/17-over-optimization.md
index c914415..7f38d92 100644
--- a/chapters/17-over-optimization.md
+++ b/chapters/17-over-optimization.md
@@ -1 +1,8 @@
+---
+prev-chapter: ""
+prev-url: ""
+next-chapter: ""
+next-url: ""
+---
+
 # [Incomplete] Over Optimization
\ No newline at end of file
diff --git a/templates/chapter.html b/templates/chapter.html
index 5eb26df..a7782c0 100644
--- a/templates/chapter.html
+++ b/templates/chapter.html
@@ -51,7 +51,7 @@ $endfor$
 $if(title)$
-$title$
+$title$
 $if(subtitle)$
 $subtitle$
 $endif$
@@ -76,6 +76,23 @@
 $toc-title$
 
 $body$
+
+$if(prev-url)$
+← Previous: $prev-chapter$
+$else$
+$endif$
+
+$if(next-url)$
+Next: $next-chapter$ →
+$endif$
+
 $for(include-after)$
 $include-after$
 $endfor$
diff --git a/templates/html.html b/templates/html.html
index 61ce6ef..03b6204 100644
--- a/templates/html.html
+++ b/templates/html.html
@@ -71,7 +71,7 @@
 Abstract
 
 Acknowledgements
 
-I would like to thank the following people who helped me with this project: Costa Huang,
+I would like to thank the following people who helped me with this project: Costa Huang, (and of course Claude)
 
 Additionally, thank you to the contributors on GitHub who helped improve this project.
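
Note on the navigation block added to `templates/chapter.html`: only the Pandoc template variables and link text are reproduced in the hunk above, not the surrounding HTML markup. As a rough illustration, a prev/next block of this kind is usually written along the following lines; the wrapper element, the class names (`chapter-nav`, `prev-link`, `next-link`), and the empty `<span>` spacer are assumptions for the sketch, not the repository's exact markup.

```html
<!-- Illustrative sketch only; wrapper element and class names are assumed. -->
<nav class="chapter-nav">
$if(prev-url)$
  <a class="prev-link" href="$prev-url$">← Previous: $prev-chapter$</a>
$else$
  <span></span> <!-- assumed placeholder when there is no previous chapter -->
$endif$
$if(next-url)$
  <a class="next-link" href="$next-url$">Next: $next-chapter$ →</a>
$endif$
</nav>
```

When a chapter is rendered through this template, Pandoc fills `$prev-chapter$`, `$prev-url$`, `$next-chapter$`, and `$next-url$` from the YAML front matter this patch adds to each chapter file; the chapters whose fields are still empty strings (`14-reasoning.md` through `17-over-optimization.md`) will need that metadata filled in before their navigation links point anywhere.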