From 2618db132d4a01165dcd9bda77d6ee5edd882684 Mon Sep 17 00:00:00 2001 From: abhinavnmagic <121893843+abhinavnmagic@users.noreply.github.com> Date: Fri, 17 Nov 2023 16:12:50 -0800 Subject: [PATCH] Adding evolcodealpaca dataset (#1821) * Adding evolcodealpaca dataset * style fix * quality fixes --------- Co-authored-by: Michael Goin --- src/sparseml/transformers/data/__init__.py | 1 + .../transformers/data/evolcodealpaca.py | 57 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 src/sparseml/transformers/data/evolcodealpaca.py diff --git a/src/sparseml/transformers/data/__init__.py b/src/sparseml/transformers/data/__init__.py index fc417ea2548..fbcdb2c4bd2 100644 --- a/src/sparseml/transformers/data/__init__.py +++ b/src/sparseml/transformers/data/__init__.py @@ -15,6 +15,7 @@ # flake8: noqa from .base_llm import TransformersDataset from .c4 import * +from .evolcodealpaca import * from .gsm8k import * from .open_platypus import * from .ptb import * diff --git a/src/sparseml/transformers/data/evolcodealpaca.py b/src/sparseml/transformers/data/evolcodealpaca.py new file mode 100644 index 00000000000..4467bf56571 --- /dev/null +++ b/src/sparseml/transformers/data/evolcodealpaca.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from torch.nn import Module

from sparseml.transformers.data.base_llm import TransformersDataset


@TransformersDataset.register(name="evolcodealpaca")
class EvolCodeAlpaca(TransformersDataset):
    """
    Dataset registered with TransformersDataset under the name
    "evolcodealpaca".

    Each raw sample is rendered into a single instruction/response prompt
    string, and the resulting list of strings is handed to
    ``create_dataloader``.

    All constructor arguments are forwarded unchanged to
    ``TransformersDataset.__init__``; their exact semantics are defined by
    that base class (not visible from this module).

    :param model: forwarded to the base class as ``model``
    :param seqlen: forwarded to the base class as ``seqlen``
    :param nsamples: forwarded to the base class as ``nsamples``
    :param seed: forwarded to the base class as ``seed``, defaults to 0
    :param split: forwarded to the base class as ``split``,
        defaults to "train"
    :param split_percent_to_use: forwarded to the base class as
        ``split_percent_to_use``, defaults to 1.0
    """

    def __init__(
        self,
        model: Module,
        seqlen: int,
        nsamples: int,
        seed: int = 0,
        split: str = "train",
        split_percent_to_use: float = 1.0,
    ):
        # NOTE(review): `path` looks like a Hugging Face Hub dataset id;
        # confirm against TransformersDataset.
        super().__init__(
            model=model,
            seqlen=seqlen,
            nsamples=nsamples,
            path="theblackcat102/evol-codealpaca-v1",
            name=None,
            seed=seed,
            split=split,
            use_max_tokens=False,
            split_percent_to_use=split_percent_to_use,
        )

        processed_data = []
        for sample in self._data:
            processed_sample = (
                "Below is an instruction that describes a "
                "programming task. Write a program that appropriately "
                "completes the request.\n\n### Instruction:\n{instruction}"
                "\n\n### Response:\n"
            ).format(instruction=sample["instruction"])

            # The response is appended only when the sample carries one;
            # otherwise the prompt ends right after the response header.
            if "output" in sample:
                processed_sample += sample["output"]
            processed_data.append(processed_sample)

        # Debug leftovers (a print of the sample followed by exit()) used to
        # sit here and terminated the interpreter before the dataloader was
        # ever created; they have been removed.
        self.create_dataloader(processed_data)