Skip to content

Commit

Permalink
Introduce v3 schema
Browse files Browse the repository at this point in the history
Closes #38

v3 includes some backwards incompatible changes to the knowledge
schema format. Here is a diff against v2. The changes are:

- Q&A pairs now have an associated context blob from the knowledge
  document.

- There is a new `document_outline` field.

- The `task_description` field is dropped.

```diff

--- src/instructlab/schema/v2/knowledge.json	2024-07-17 12:56:37
+++ src/instructlab/schema/v3/knowledge.json	2024-07-17 15:38:30
@@ -6,9 +6,9 @@
     "required": [
         "created_by",
         "domain",
-        "task_description",
         "seed_examples",
-        "document"
+        "document",
+        "document_outline"
     ],
     "unevaluatedProperties": false,
     "properties": {
@@ -27,15 +27,6 @@
                 "Pop culture"
             ]
         },
-        "task_description": {
-            "description": "A description of the task which is used in prompts to the teacher model during synthetic data generation. The description should be detailed and prescriptive to improve the teacher model's responses.",
-            "type": "string",
-            "minLength": 1,
-            "examples": [
-                "To teach a language model about softball history",
-                "To teach a language model about tabby cats"
-            ]
-        },
         "seed_examples": {
             "description": "An array of seed examples for synthetic data generation.",
             "type": "array",
@@ -44,20 +35,39 @@
             "items": {
                 "type": "object",
                 "required": [
-                    "question",
-                    "answer"
+                    "context",
+                    "questions_and_answers"
                 ],
                 "unevaluatedProperties": false,
                 "properties": {
-                    "question": {
-                        "description": "A question used for synthetic data generation.",
+                    "context": {
+                        "description": "Context from the document associated with this set of sample q&a pairs.",
                         "type": "string",
                         "minLength": 1
                     },
-                    "answer": {
-                        "description": "The desired response for the question.",
-                        "type": "string",
-                        "minLength": 1
+                    "questions_and_answers": {
+                        "type": "array",
+                        "minItems": 3,
+                        "uniqueItems": true,
+                        "items": {
+                            "type": "object",
+                            "required": [
+                                "question",
+                                "answer"
+                            ],
+                            "properties": {
+                                "question": {
+                                    "description": "A question used for synthetic data generation.",
+                                    "type": "string",
+                                    "minLength": 1
+                                },
+                                "answer": {
+                                    "description": "The desired response for the question.",
+                                    "type": "string",
+                                    "minLength": 1
+                                }
+                            }
+                        }
                     }
                 }
             }
@@ -104,6 +114,11 @@
                     }
                 }
             }
+        },
+        "document_outline": {
+            "description": "An outline of the document.",
+            "type": "string",
+            "minLength": 1
         }
     }
 }
```

Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
russellb committed Jul 17, 2024
1 parent 859cb72 commit 2be6938
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 0 deletions.
Empty file.
124 changes: 124 additions & 0 deletions src/instructlab/schema/v3/knowledge.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
{
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "Knowledge",
    "description": "A knowledge skill.",
    "type": "object",
    "$ref": "./version.json",
    "required": [
        "created_by",
        "domain",
        "seed_examples",
        "document",
        "document_outline"
    ],
    "unevaluatedProperties": false,
    "properties": {
        "created_by": {
            "description": "The GitHub username of the contributor.",
            "type": "string",
            "minLength": 1
        },
        "domain": {
            "description": "The knowledge domain which is used in prompts to the teacher model during synthetic data generation. The domain should be brief such as the title to a textbook chapter or section.",
            "type": "string",
            "minLength": 1,
            "examples": [
                "Chemistry",
                "History",
                "Pop culture"
            ]
        },
        "seed_examples": {
            "description": "An array of seed examples for synthetic data generation.",
            "type": "array",
            "minItems": 5,
            "uniqueItems": true,
            "items": {
                "type": "object",
                "required": [
                    "context",
                    "questions_and_answers"
                ],
                "unevaluatedProperties": false,
                "properties": {
                    "context": {
                        "description": "Context from the document associated with this set of sample q&a pairs.",
                        "type": "string",
                        "minLength": 1
                    },
                    "questions_and_answers": {
                        "description": "An array of question and answer pairs associated with the context.",
                        "type": "array",
                        "minItems": 3,
                        "uniqueItems": true,
                        "items": {
                            "type": "object",
                            "required": [
                                "question",
                                "answer"
                            ],
                            "unevaluatedProperties": false,
                            "properties": {
                                "question": {
                                    "description": "A question used for synthetic data generation.",
                                    "type": "string",
                                    "minLength": 1
                                },
                                "answer": {
                                    "description": "The desired response for the question.",
                                    "type": "string",
                                    "minLength": 1
                                }
                            }
                        }
                    }
                }
            }
        },
        "document": {
            "description": "The knowledge documents.",
            "type": "object",
            "required": [
                "repo",
                "commit",
                "patterns"
            ],
            "unevaluatedProperties": false,
            "properties": {
                "repo": {
                    "description": "The URL to a Git repository holding the knowledge documents.",
                    "type": "string",
                    "minLength": 1,
                    "examples": [
                        "https://github.com/instructlab/instructlab.git"
                    ]
                },
                "commit": {
                    "description": "The commit in the Git repository containing the knowledge documents.",
                    "type": "string",
                    "minLength": 1,
                    "examples": [
                        "951999afdc59c46d325493568193b40bd5439c9e"
                    ]
                },
                "patterns": {
                    "description": "An array of glob patterns of the knowledge documents in the Git repository.",
                    "type": "array",
                    "minItems": 1,
                    "uniqueItems": true,
                    "items": {
                        "type": "string",
                        "minLength": 1,
                        "examples": [
                            "*.md",
                            "folder/*.md",
                            "folder/knowledge_doc.md"
                        ]
                    }
                }
            }
        },
        "document_outline": {
            "description": "An outline of the document.",
            "type": "string",
            "minLength": 1
        }
    }
}
15 changes: 15 additions & 0 deletions src/instructlab/schema/v3/version.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "Taxonomy Document Schema Version",
    "description": "Requires a taxonomy document to declare the schema version it conforms to.",
    "type": "object",
    "required": [
        "version"
    ],
    "properties": {
        "version": {
            "description": "The schema version of the taxonomy document.",
            "type": "integer",
            "$comment": "This value must match the number in the containing folder.",
            "const": 3
        }
    }
}

0 comments on commit 2be6938

Please sign in to comment.