From f7486f4c06471f7125556110251a06d69ef1b2ab Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Mon, 15 Jul 2024 19:46:49 -0400
Subject: [PATCH] Set gen_kwargs['n'] dynamically in the simple pipelines

We need a way to allow `--num-instructions`, or in the future
`--sdg-scale-factor`, to influence how many instructions we generate
using the simple pipelines. The way to do this seems to be to set `n`
to this value.

Since this is a runtime parameter, and we only want to set it for `n`
in certain cases, add a new value for gen_kwargs['n'] called `scaled`
which is a hint to use the runtime parameter here.

Closes #130

Signed-off-by: Russell Bryant
---
 src/instructlab/sdg/llmblock.py                     | 12 +++++++++---
 src/instructlab/sdg/pipelines/schema/v1.json        | 10 +++++++++-
 .../sdg/pipelines/simple/freeform_skills.yaml       |  1 +
 .../sdg/pipelines/simple/grounded_skills.yaml       |  2 +-
 src/instructlab/sdg/pipelines/simple/knowledge.yaml |  1 +
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
index 8d0b26c4..583f38d4 100644
--- a/src/instructlab/sdg/llmblock.py
+++ b/src/instructlab/sdg/llmblock.py
@@ -133,8 +133,15 @@ def _gen_kwargs(self, **gen_kwargs):
             gen_kwargs["max_tokens"] = int(gen_kwargs["max_tokens"])
         if "temperature" in gen_kwargs:
             gen_kwargs["temperature"] = float(gen_kwargs["temperature"])
+        gen_kwargs["n"] = self._get_n(gen_kwargs)
         return gen_kwargs
 
+    def _get_n(self, gen_kwargs):
+        n = gen_kwargs.get("n", 1)
+        if isinstance(n, str) and n == "scaled":
+            n = self.ctx.num_instructions_to_generate
+        return n
+
     def _generate(self, samples, **gen_kwargs) -> list:
         prompts = [
             self.model_prompt.format(prompt=self._format_prompt(sample))
@@ -148,10 +155,9 @@ def _generate(self, samples, **gen_kwargs) -> list:
             )
             return [choice.text.strip() for choice in response.choices]
 
-        n = gen_kwargs.get("n", 1)
         results = []
         for prompt in prompts:
-            for _ in range(n):
+            for _ in range(generate_args["n"]):
                 response = self.ctx.client.completions.create(
                     prompt=prompt, **generate_args
                 )
@@ -193,7 +199,7 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset:
         outputs = self._generate(samples, **gen_kwargs)
         logger.debug("Generated outputs: %s", outputs)
 
-        num_parallel_samples = gen_kwargs.get("n", 1)
+        num_parallel_samples = self._get_n(gen_kwargs)
         extended_samples = []
 
         # Duplicate each input sample n times, where n is the number
diff --git a/src/instructlab/sdg/pipelines/schema/v1.json b/src/instructlab/sdg/pipelines/schema/v1.json
index 692be6c3..ce807a83 100644
--- a/src/instructlab/sdg/pipelines/schema/v1.json
+++ b/src/instructlab/sdg/pipelines/schema/v1.json
@@ -46,7 +46,15 @@
               "type": "number"
             },
             "n": {
-              "type": "number"
+              "oneOf": [
+                {
+                  "type": "number"
+                },
+                {
+                  "type": "string",
+                  "enum": ["scaled"]
+                }
+              ]
             },
             "seed": {
               "type": "number"
diff --git a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
index be589af8..a528d0a3 100644
--- a/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/freeform_skills.yaml
@@ -9,5 +9,6 @@ blocks:
       gen_kwargs:
         max_tokens: 2048
         temperature: 0.7
+        n: scaled
     drop_duplicates:
       - output
diff --git a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
index 23925034..90c32d03 100644
--- a/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
+++ b/src/instructlab/sdg/pipelines/simple/grounded_skills.yaml
@@ -9,6 +9,6 @@ blocks:
       gen_kwargs:
         max_tokens: 2048
         temperature: 0.7
-        n: 10
+        n: scaled
     drop_duplicates:
      - output
diff --git a/src/instructlab/sdg/pipelines/simple/knowledge.yaml b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
index 7e2cdc4f..0df81659 100644
--- a/src/instructlab/sdg/pipelines/simple/knowledge.yaml
+++ b/src/instructlab/sdg/pipelines/simple/knowledge.yaml
@@ -9,5 +9,6 @@ blocks:
      gen_kwargs:
        max_tokens: 2048
        temperature: 0.7
+       n: scaled
    drop_duplicates:
      - output
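
For reference, the resolution behavior introduced above can be illustrated
with a small standalone Python sketch. This is not code from the repository:
the SimpleContext class and the sample values are hypothetical stand-ins,
and only the resolution logic mirrors _get_n() in the patch.

class SimpleContext:
    # Hypothetical stand-in for the pipeline context object that carries
    # the runtime --num-instructions value.
    def __init__(self, num_instructions_to_generate: int):
        self.num_instructions_to_generate = num_instructions_to_generate


def resolve_n(ctx: SimpleContext, gen_kwargs: dict) -> int:
    # "scaled" is a hint to substitute the runtime parameter; any other
    # value is used as-is, and the default is 1 when "n" is not set.
    n = gen_kwargs.get("n", 1)
    if isinstance(n, str) and n == "scaled":
        n = ctx.num_instructions_to_generate
    return n


ctx = SimpleContext(num_instructions_to_generate=30)
print(resolve_n(ctx, {"n": "scaled"}))  # -> 30, scaled to the runtime parameter
print(resolve_n(ctx, {"n": 10}))        # -> 10, an explicit number is untouched
print(resolve_n(ctx, {}))               # -> 1, default when "n" is absent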