Skip to content

Commit

Permalink
Adding evolcodealpaca dataset (#1821)
Browse files (browse the repository at this point in the history)
* Adding evolcodealpaca dataset

* style fix

* quality fixes

---------

Co-authored-by: Michael Goin <[email protected]>
  • Loading branch information
abhinavnmagic and mgoin authored Nov 18, 2023
1 parent df51fa4 commit 2618db1
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/sparseml/transformers/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# flake8: noqa
from .base_llm import TransformersDataset
from .c4 import *
from .evolcodealpaca import *
from .gsm8k import *
from .open_platypus import *
from .ptb import *
Expand Down
57 changes: 57 additions & 0 deletions src/sparseml/transformers/data/evolcodealpaca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch.nn import Module

from sparseml.transformers.data.base_llm import TransformersDataset


@TransformersDataset.register(name="evolcodealpaca")
class EvolCodeAlpaca(TransformersDataset):
    """
    Dataset wrapper for "theblackcat102/evol-codealpaca-v1", registered with
    TransformersDataset under the name "evolcodealpaca".

    Each sample is rendered into an instruction/response prompt; the sample's
    "output" field is appended after the response header when it is present.
    The rendered strings are then handed to ``create_dataloader``.

    :param model: forwarded unchanged to TransformersDataset
    :param seqlen: forwarded unchanged to TransformersDataset
    :param nsamples: forwarded unchanged to TransformersDataset
    :param seed: forwarded unchanged to TransformersDataset
    :param split: forwarded unchanged to TransformersDataset
    :param split_percent_to_use: forwarded unchanged to TransformersDataset
    """

    def __init__(
        self,
        model: Module,
        seqlen: int,
        nsamples: int,
        seed: int = 0,
        split: str = "train",
        split_percent_to_use: float = 1.0,
    ):
        super().__init__(
            model=model,
            seqlen=seqlen,
            nsamples=nsamples,
            path="theblackcat102/evol-codealpaca-v1",
            name=None,
            seed=seed,
            split=split,
            use_max_tokens=False,
            split_percent_to_use=split_percent_to_use,
        )

        # Loop-invariant prompt template, built once instead of per sample.
        prompt_template = (
            "Below is an instruction that describes a "
            "programming task. Write a program that appropriately "
            "completes the request.\n\n### Instruction:\n{instruction}"
            "\n\n### Response:\n"
        )

        # NOTE(review): self._data is presumably populated by the base-class
        # constructor above with dict-like samples -- confirm in base_llm.
        processed_data = []
        for sample in self._data:
            processed_sample = prompt_template.format(
                instruction=sample["instruction"]
            )

            # Samples without a reference answer keep just the bare prompt.
            if "output" in sample:
                processed_sample += sample["output"]
            processed_data.append(processed_sample)

        # No debug print()/exit() here: exiting would kill the process before
        # the dataloader is ever created.
        self.create_dataloader(processed_data)

0 comments on commit 2618db1

Please sign in to comment.