Adding evolcodealpaca dataset (#1821)

* Adding evolcodealpaca dataset * style fix * quality fixes --------- Co-authored-by: Michael Goin <[email protected]>
neuralmagic · Nov 18, 2023 · 2618db1 · 2618db1
1 parent df51fa4
commit 2618db1
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 0 deletions.
diff --git a/src/sparseml/transformers/data/__init__.py b/src/sparseml/transformers/data/__init__.py
@@ -15,6 +15,7 @@
 # flake8: noqa
 from .base_llm import TransformersDataset
 from .c4 import *
+from .evolcodealpaca import *
 from .gsm8k import *
 from .open_platypus import *
 from .ptb import *

diff --git a/src/sparseml/transformers/data/evolcodealpaca.py b/src/sparseml/transformers/data/evolcodealpaca.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch.nn import Module
+
+from sparseml.transformers.data.base_llm import TransformersDataset
+
+
+@TransformersDataset.register(name="evolcodealpaca")
+class EvolCodeAlpaca(TransformersDataset):
+    def __init__(
+        self,
+        model: Module,
+        seqlen: int,
+        nsamples: int,
+        seed: int = 0,
+        split: str = "train",
+        split_percent_to_use: float = 1.0,
+    ):
+        super().__init__(
+            model=model,
+            seqlen=seqlen,
+            nsamples=nsamples,
+            path="theblackcat102/evol-codealpaca-v1",
+            name=None,
+            seed=seed,
+            split=split,
+            use_max_tokens=False,
+            split_percent_to_use=split_percent_to_use,
+        )
+
+        processed_data = []
+        for sample in self._data:
+            processed_sample = (
+                "Below is an instruction that describes a "
+                "programming task. Write a program that appropriately "
+                "completes the request.\n\n### Instruction:\n{instruction}"
+                "\n\n### Response:\n"
+            ).format(instruction=sample["instruction"])
+
+            if "output" in sample:
+                processed_sample += sample["output"]
+            processed_data.append(processed_sample)
+            print(processed_sample)
+            exit()
+        self.create_dataloader(processed_data)