diff --git a/src/instructlab/schema/v3/__init__.py b/src/instructlab/schema/v3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/instructlab/schema/v3/knowledge.json b/src/instructlab/schema/v3/knowledge.json new file mode 100644 index 0000000..96dd605 --- /dev/null +++ b/src/instructlab/schema/v3/knowledge.json @@ -0,0 +1,124 @@ +{ + "title": "Knowledge", + "description": "A knowledge skill.", + "type": "object", + "$ref": "./version.json", + "required": [ + "created_by", + "domain", + "seed_examples", + "document", + "document_outline" + ], + "unevaluatedProperties": false, + "properties": { + "created_by": { + "description": "The GitHub username of the contributor.", + "type": "string", + "minLength": 1 + }, + "domain": { + "description": "The knowledge domain which is used in prompts to the teacher model during synthetic data generation. The domain should be brief such as the title to a textbook chapter or section.", + "type": "string", + "minLength": 1, + "examples": [ + "Chemistry", + "History", + "Pop culture" + ] + }, + "seed_examples": { + "description": "An array of seed examples for synthetic data generation.", + "type": "array", + "minItems": 5, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "context", + "questions_and_answers" + ], + "unevaluatedProperties": false, + "properties": { + "context": { + "description": "Context from the document associated with this set of sample q&a pairs.", + "type": "string", + "minLength": 1 + }, + "questions_and_answers": { + "type": "array", + "minItems": 3, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "question", + "answer" + ], + "properties": { + "question": { + "description": "A question used for synthetic data generation.", + "type": "string", + "minLength": 1 + }, + "answer": { + "description": "The desired response for the question.", + "type": "string", + "minLength": 1 + } + } + } + } + } + } + }, + "document": { + "description": "The knowledge documents.", + "type": "object", + "required": [ + "repo", + "commit", + "patterns" + ], + "unevaluatedProperties": false, + "properties": { + "repo": { + "description": "The URL to a Git repository holding the knowledge documents.", + "type": "string", + "minLength": 1, + "examples": [ + "https://github.com/instructlab/instructlab.git" + ] + }, + "commit": { + "description": "The commit in the Git repository containing the knowledge documents.", + "type": "string", + "minLength": 1, + "examples": [ + "951999afdc59c46d325493568193b40bd5439c9e" + ] + }, + "patterns": { + "description": "An array of glob patterns of the knowledge documents in the Git repository.", + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "string", + "minLength": 1, + "examples": [ + "*.md", + "folder/*.md", + "folder/knowledge_doc.md" + ] + } + } + } + }, + "document_outline": { + "description": "An outline of the document.", + "type": "string", + "minLength": 1 + } + } +} diff --git a/src/instructlab/schema/v3/version.json b/src/instructlab/schema/v3/version.json new file mode 100644 index 0000000..8c0dd3e --- /dev/null +++ b/src/instructlab/schema/v3/version.json @@ -0,0 +1,15 @@ +{ + "title": "Taxonomy Document Schema Version", + "type": "object", + "required": [ + "version" + ], + "properties": { + "version": { + "description": "The schema version of the taxonomy document.", + "type": "integer", + "$comment": "This value must match the number in the containing folder.", + "const": 3 + } + } +}