Commit

WIP add next chapter buttons, fix title link (#29)

natolambert authored Jan 5, 2025
1 parent 012c844 commit 969d4e4
Showing 19 changed files with 138 additions and 3 deletions.
7 changes: 7 additions & 0 deletions chapters/01-introduction.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Home"
prev-url: "https://rlhfbook.com/"
next-chapter: "Key Related Works"
next-url: "02-related-works.html"
---

# Introduction

Reinforcement Learning from Human Feedback (RLHF) is a technique used to incorporate human information into AI systems.
7 changes: 7 additions & 0 deletions chapters/04-related-works.md → chapters/02-related-works.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Introduction"
prev-url: "01-introduction.html"
next-chapter: "Problem Setup"
next-url: "03-setup.html"
---

# Key Related Works

In this chapter we detail the key papers and projects that got the RLHF field to where it is today.
7 changes: 7 additions & 0 deletions chapters/05-setup.md → chapters/03-setup.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Key Related Works"
prev-url: "02-related-works.html"
next-chapter: "Problem Formulation"
next-url: "04-optimization.html"
---

# Definitions

This chapter includes all the definitions, symbols, and operations frequently used in the RLHF process.
6 changes: 6 additions & 0 deletions chapters/03-optimization.md → chapters/04-optimization.md
@@ -1,3 +1,9 @@
---
prev-chapter: "Problem Setup"
prev-url: "03-setup.html"
next-chapter: "The Nature of Preferences"
next-url: "05-preferences.html"
---

# Problem Formulation

6 changes: 6 additions & 0 deletions chapters/02-preferences.md → chapters/05-preferences.md
@@ -1,3 +1,9 @@
---
prev-chapter: "Problem Formulation"
prev-url: "04-optimization.html"
next-chapter: "Preference Data"
next-url: "06-preference-data.html"
---

# The Nature of Preferences

7 changes: 7 additions & 0 deletions chapters/06-preference-data.md
@@ -1,3 +1,10 @@
---
prev-chapter: "The Nature of Preferences"
prev-url: "05-preferences.html"
next-chapter: "Reward Modeling"
next-url: "07-reward-models.html"
---

# [Incomplete] Preference Data

## Collecting Preference Data
7 changes: 7 additions & 0 deletions chapters/07-reward-models.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Preference Data"
prev-url: "06-preference-data.html"
next-chapter: "Regularization"
next-url: "08-regularization.html"
---

# Reward Modeling

Reward models are core to the modern approach to RLHF.
7 changes: 7 additions & 0 deletions chapters/08-regularization.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Reward Modeling"
prev-url: "07-reward-models.html"
next-chapter: "Instruction Tuning"
next-url: "09-instruction-tuning.html"
---

# Regularization

Throughout the RLHF optimization, many regularization steps are used to prevent over-optimization of the reward model.
7 changes: 7 additions & 0 deletions chapters/09-instruction-tuning.md
@@ -1 +1,8 @@
---
prev-chapter: "Regularization"
prev-url: "08-regularization.html"
next-chapter: "Rejection Sampling"
next-url: "10-rejection-sampling.html"
---

# Instruction Tuning
7 changes: 7 additions & 0 deletions chapters/10-rejection-sampling.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Instruction Tuning"
prev-url: "09-instruction-tuning.html"
next-chapter: "Policy Gradients"
next-url: "11-policy-gradients.html"
---

# Rejection Sampling

Rejection Sampling (RS) is a popular and simple baseline for performing preference fine-tuning.
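As an illustrative sketch of this baseline (best-of-N selection against a reward model; `generate` and `reward` are hypothetical stand-ins for a policy and a reward model, not APIs from this repository):

```python
def best_of_n(prompt, generate, reward, n=8):
    """Rejection sampling / best-of-N: draw n candidate completions from
    the policy and keep the one the reward model scores highest.

    `generate(prompt)` and `reward(prompt, completion)` are placeholders
    for a language-model sampler and a trained reward model."""
    completions = [generate(prompt) for _ in range(n)]
    return max(completions, key=lambda c: reward(prompt, c))
```

In practice the kept completions are then used for further fine-tuning rather than returned directly.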
8 changes: 8 additions & 0 deletions chapters/11-policy-gradients.md
@@ -1,3 +1,10 @@
---
prev-chapter: "Rejection Sampling"
prev-url: "10-rejection-sampling.html"
next-chapter: "Direct Alignment Algorithms"
next-url: "12-direct-alignment.html"
---

# [Incomplete] Policy Gradient Algorithms


@@ -25,6 +32,7 @@ $$\nabla_\theta J(\pi_\theta) = \mathbb{E}_\tau \left[ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t \mid s_t) R(\tau) \right]$$

REINFORCE is a specific implementation of the vanilla policy gradient that uses a Monte Carlo estimator of the gradient.
[@ahmadian2024back]
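A minimal sketch of this Monte Carlo estimator on a toy softmax bandit (illustrative only, not the book's implementation; all names here are hypothetical):

```python
import numpy as np

rng = np.random.default_rng(0)

def softmax(logits):
    z = np.exp(logits - logits.max())
    return z / z.sum()

def reinforce_step(theta, rewards, lr=0.1):
    """One REINFORCE update on a 3-armed bandit.

    For a softmax policy, grad log pi(a) = one_hot(a) - pi, so the
    Monte Carlo gradient estimate is R * (one_hot(a) - pi)."""
    pi = softmax(theta)
    a = rng.choice(len(theta), p=pi)   # sample an action from the policy
    grad_log_pi = -pi
    grad_log_pi[a] += 1.0
    return theta + lr * rewards[a] * grad_log_pi  # ascend the estimated gradient

theta = np.zeros(3)
true_rewards = np.array([0.0, 0.0, 1.0])  # arm 2 is the only rewarding arm
for _ in range(500):
    theta = reinforce_step(theta, true_rewards)
# the policy concentrates its probability mass on the rewarding arm
```

Sampling full language-model trajectories replaces the single bandit action in the RLHF setting, but the estimator has the same form.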

### Proximal Policy Optimization

## Computing Policy Gradients with a Language Model
7 changes: 7 additions & 0 deletions chapters/12-direct-alignment.md
@@ -1 +1,8 @@
---
prev-chapter: "Policy Gradients"
prev-url: "11-policy-gradients.html"
next-chapter: "Constitutional AI"
next-url: "13-cai.html"
---

# [Incomplete] Direct Alignment Algorithms
7 changes: 7 additions & 0 deletions chapters/13-cai.md
@@ -1 +1,8 @@
---
prev-chapter: "Direct Alignment"
prev-url: "12-direct-alignment.html"
next-chapter: "Reasoning Models"
next-url: "14-reasoning.html"
---

# [Incomplete] Constitutional AI and AI Feedback
9 changes: 8 additions & 1 deletion chapters/14-reasoning.md
@@ -1 +1,8 @@
# [Incomplete] Constitutional AI
---
prev-chapter: ""
prev-url: ""
next-chapter: ""
next-url: ""
---

# [Incomplete] Reasoning Training and Models
7 changes: 7 additions & 0 deletions chapters/15-synthetic.md
@@ -1 +1,8 @@
---
prev-chapter: ""
prev-url: ""
next-chapter: ""
next-url: ""
---

# [Incomplete] Synthetic Data
7 changes: 7 additions & 0 deletions chapters/16-evaluation.md
@@ -1 +1,8 @@
---
prev-chapter: ""
prev-url: ""
next-chapter: ""
next-url: ""
---

# [Incomplete] Evaluation
7 changes: 7 additions & 0 deletions chapters/17-over-optimization.md
@@ -1 +1,8 @@
---
prev-chapter: ""
prev-url: ""
next-chapter: ""
next-url: ""
---

# [Incomplete] Over Optimization
19 changes: 18 additions & 1 deletion templates/chapter.html
@@ -51,7 +51,7 @@
$endfor$
$if(title)$
<header id="title-block-header">
<h1 class="title"><a href="www.rlhfbook.com" style="color: inherit; text-decoration: none;">$title$</a></h1>
<h1 class="title"><a href="https://rlhfbook.com/" style="color: inherit; text-decoration: none;">$title$</a></h1>
$if(subtitle)$
<p class="subtitle">$subtitle$</p>
$endif$
@@ -76,6 +76,23 @@ <h2 id="$idprefix$toc-title">$toc-title$</h2>
<div id="content">
$body$
</div>

<div id="chapter-navigation" style="display: flex; justify-content: space-between; padding: 2em 0;">
$if(prev-url)$
<a href="$prev-url$" class="prev-chapter">
← Previous: $prev-chapter$
</a>
$else$
<div></div>
$endif$

$if(next-url)$
<a href="$next-url$" class="next-chapter">
Next: $next-chapter$ →
</a>
$endif$
</div>

$for(include-after)$
$include-after$
$endfor$
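Because the prev/next chain added above is maintained by hand across files that this commit also renames, a small consistency check can catch broken links. The sketch below assumes chapters live in `chapters/` with the frontmatter format shown in the diffs; the script itself is hypothetical and not part of this commit:

```python
import pathlib
import re

def read_frontmatter(path):
    """Parse the simple `key: "value"` YAML frontmatter block used above."""
    text = path.read_text(encoding="utf-8")
    match = re.match(r"---\n(.*?)\n---", text, re.DOTALL)
    fields = {}
    if match:
        for line in match.group(1).splitlines():
            key, _, value = line.partition(":")
            fields[key.strip()] = value.strip().strip('"')
    return fields

def check_chain(chapter_dir="chapters"):
    """Return (file, actual next-url, expected next-url) for every chapter
    whose next-url does not point at the file that follows it."""
    files = sorted(pathlib.Path(chapter_dir).glob("*.md"))
    problems = []
    for current, following in zip(files, files[1:]):
        meta = read_frontmatter(current)
        expected = following.with_suffix(".html").name
        if meta.get("next-url", "") != expected:
            problems.append((current.name, meta.get("next-url", ""), expected))
    return problems
```

Running such a check in CI would have flagged the stale links that the renames in this commit (e.g. `04-related-works.md` → `02-related-works.md`) could otherwise leave behind.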
2 changes: 1 addition & 1 deletion templates/html.html
Expand Up @@ -71,7 +71,7 @@ <h2>Abstract</h2>
<body>
<section id="acknowledgements" style="padding: 20px; text-align: center;">
<h2>Acknowledgements</h2>
<p>I would like to thank the following people who helped me with this project: Costa Huang, </p>
<p>I would like to thank the following people who helped me with this project: Costa Huang, (and of course Claude)</p>
<p>Additionally, thank you to the <a href="https://github.com/natolambert/rlhf-book/graphs/contributors">contributors on GitHub</a> who helped improve this project.</p>
</section>
<footer style="padding: 20px; text-align: center;">
