From 2618db132d4a01165dcd9bda77d6ee5edd882684 Mon Sep 17 00:00:00 2001
From: abhinavnmagic <121893843+abhinavnmagic@users.noreply.github.com>
Date: Fri, 17 Nov 2023 16:12:50 -0800
Subject: [PATCH] Adding evolcodealpaca dataset (#1821)

* Adding evolcodealpaca dataset

* style fix

* quality fixes

---------

Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 src/sparseml/transformers/data/__init__.py    |  1 +
 .../transformers/data/evolcodealpaca.py       | 57 +++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 src/sparseml/transformers/data/evolcodealpaca.py

diff --git a/src/sparseml/transformers/data/__init__.py b/src/sparseml/transformers/data/__init__.py
index fc417ea2548..fbcdb2c4bd2 100644
--- a/src/sparseml/transformers/data/__init__.py
+++ b/src/sparseml/transformers/data/__init__.py
@@ -15,6 +15,7 @@
 # flake8: noqa
 from .base_llm import TransformersDataset
 from .c4 import *
+from .evolcodealpaca import *
 from .gsm8k import *
 from .open_platypus import *
 from .ptb import *
diff --git a/src/sparseml/transformers/data/evolcodealpaca.py b/src/sparseml/transformers/data/evolcodealpaca.py
new file mode 100644
index 00000000000..4467bf56571
--- /dev/null
+++ b/src/sparseml/transformers/data/evolcodealpaca.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch.nn import Module
+
+from sparseml.transformers.data.base_llm import TransformersDataset
+
+
+@TransformersDataset.register(name="evolcodealpaca")
+class EvolCodeAlpaca(TransformersDataset):
+    """
+    Evol-CodeAlpaca instruction data rendered as instruction/response prompts.
+
+    Each sample's ``instruction`` field is substituted into a fixed prompt
+    template; when the sample also has an ``output`` field, that text is appended
+    after the ``### Response:`` marker. The resulting strings are passed to
+    ``create_dataloader``.
+
+    Parameter semantics are defined by ``TransformersDataset``; every argument is
+    forwarded to it unchanged.
+
+    :param model: model forwarded to the base class
+    :param seqlen: sequence length forwarded to the base class
+    :param nsamples: number of samples forwarded to the base class
+    :param seed: seed forwarded to the base class
+    :param split: dataset split forwarded to the base class
+    :param split_percent_to_use: forwarded to the base class; presumably the
+        fraction of the split to use (confirm against ``TransformersDataset``)
+    """
+
+    def __init__(
+        self,
+        model: Module,
+        seqlen: int,
+        nsamples: int,
+        seed: int = 0,
+        split: str = "train",
+        split_percent_to_use: float = 1.0,
+    ):
+        super().__init__(
+            model=model,
+            seqlen=seqlen,
+            nsamples=nsamples,
+            path="theblackcat102/evol-codealpaca-v1",
+            name=None,
+            seed=seed,
+            split=split,
+            use_max_tokens=False,
+            split_percent_to_use=split_percent_to_use,
+        )
+
+        # The template is identical for every sample, so build it once up front.
+        template = (
+            "Below is an instruction that describes a "
+            "programming task. Write a program that appropriately "
+            "completes the request.\n\n### Instruction:\n{instruction}"
+            "\n\n### Response:\n"
+        )
+
+        prompts = []
+        for sample in self._data:
+            prompt = template.format(instruction=sample["instruction"])
+            # Samples without an "output" field keep an empty response section.
+            if "output" in sample:
+                prompt += sample["output"]
+            prompts.append(prompt)
+
+        self.create_dataloader(prompts)