
Commit

remove decapoda-research/llama-7b-hf tokenizer and skip tests if meta-llama/Llama-2-7b is not available
mreso committed Dec 8, 2023
1 parent 15e4bd9 commit 0022d97
Showing 6 changed files with 37 additions and 15 deletions.
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -38,4 +38,9 @@ exclude = [
 packages = ["src/llama_recipes"]

 [tool.hatch.metadata.hooks.requirements_txt]
-files = ["requirements.txt"]
+files = ["requirements.txt"]
+
+[tool.pytest.ini_options]
+markers = [
+    "skip_missing_tokenizer: skip tests when we can not access meta-llama/Llama-2-7b-hf on huggingface hub (Log in with `huggingface-cli login` to unskip).",
+]
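
The marker only documents the skip condition; the actual skipping is done by the autouse fixture added to tests/conftest.py below. To unskip the marked tests, authenticate against the Hugging Face Hub before running pytest, either with `huggingface-cli login` as the marker text says or programmatically. A minimal sketch (the token value is a placeholder, and the account must already have access to the gated meta-llama/Llama-2-7b-hf repo):

# Log in to the Hugging Face Hub so LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# can download the gated tokenizer; equivalent to running `huggingface-cli login`.
from huggingface_hub import login

login(token="hf_xxx")  # placeholder token, not a real credential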
22 changes: 16 additions & 6 deletions tests/conftest.py
@@ -5,14 +5,24 @@

 from transformers import LlamaTokenizer

+@pytest.fixture(scope="module")
+def llama_tokenizer():
+    try:
+        return LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+    except OSError:
+        return None
+
+
 @pytest.fixture
-def setup_tokenizer():
-    def _helper(tokenizer):
+def setup_tokenizer(llama_tokenizer):
+    def _helper(tokenizer_mock):
         #Align with Llama 2 tokenizer
-        tokenizer.from_pretrained.return_value = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
-        tokenizer.from_pretrained.return_value.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>'})
-        tokenizer.from_pretrained.return_value.bos_token_id = 1
-        tokenizer.from_pretrained.return_value.eos_token_id = 2
+        tokenizer_mock.from_pretrained.return_value = llama_tokenizer

     return _helper

+@pytest.fixture(autouse=True)
+def skip_if_tokenizer_is_missing(request, llama_tokenizer):
+    if request.node.get_closest_marker("skip_missing_tokenizer"):
+        if llama_tokenizer is None:
+            pytest.skip("Llama tokenizer could not be accessed. Did you log into huggingface hub and provided the correct token?")
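
For reference, a minimal sketch of how a test combines the new marker with these fixtures (hypothetical test, not part of this commit; the real tests below follow the same pattern):

import pytest
from unittest.mock import patch


@pytest.mark.skip_missing_tokenizer()              # skipped by the autouse fixture when the tokenizer is unavailable
@patch('llama_recipes.finetuning.LlamaTokenizer')
def test_uses_llama2_tokenizer(tokenizer, setup_tokenizer):
    setup_tokenizer(tokenizer)                     # the patched class now hands out the real Llama 2 tokenizer
    assert tokenizer.from_pretrained.return_value.bos_token_id == 1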
3 changes: 2 additions & 1 deletion tests/datasets/test_custom_dataset.py
@@ -17,6 +17,7 @@ def check_padded_entry(batch):
     assert batch["input_ids"][0][-1] == 2


+@pytest.mark.skip_missing_tokenizer()
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
@@ -29,7 +30,7 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,

     kwargs = {
         "dataset": "custom_dataset",
-        "model_name": "decapoda-research/llama-7b-hf", # We use the tokenizer as a surrogate for llama2 tokenizer here
+        "model_name": "meta-llama/Llama-2-7b-hf",
         "custom_dataset.file": "examples/custom_dataset.py",
         "custom_dataset.train_split": "validation",
         "batch_size_training": 2,
8 changes: 5 additions & 3 deletions tests/datasets/test_grammar_datasets.py
@@ -1,11 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

+import pytest
 from unittest.mock import patch

 from transformers import LlamaTokenizer


+@pytest.mark.skip_missing_tokenizer()
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
@@ -18,7 +20,7 @@ def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker

     BATCH_SIZE = 8
     kwargs = {
-        "model_name": "decapoda-research/llama-7b-hf",
+        "model_name": "meta-llama/Llama-2-7b-hf",
         "batch_size_training": BATCH_SIZE,
         "val_batch_size": 1,
         "use_peft": False,
@@ -46,8 +48,8 @@ def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker
     assert "input_ids" in batch.keys()
     assert "attention_mask" in batch.keys()

-    assert batch["labels"][0][29] == -100
-    assert batch["labels"][0][30] == 29871
+    assert batch["labels"][0][31] == -100
+    assert batch["labels"][0][32] == 1152

     assert batch["input_ids"][0][0] == 1
     assert batch["labels"][0][-1] == 2
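
The expected label values and positions change because the tests now run against the real Llama 2 tokenizer instead of the decapoda-research surrogate, which encodes the prompts into different token ids. A minimal sketch of how such expected ids can be re-derived (the prompt string is a hypothetical example, not taken from the dataset code):

# Requires access to the gated meta-llama/Llama-2-7b-hf repo on the Hugging Face Hub.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
ids = tokenizer.encode("Correct this to standard English:")    # hypothetical prompt
print(ids)                                   # token ids, starting with bos_token_id == 1
print(tokenizer.convert_ids_to_tokens(ids))  # inspect which position holds which token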
6 changes: 4 additions & 2 deletions tests/datasets/test_samsum_datasets.py
@@ -1,10 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

+import pytest
 from functools import partial
 from unittest.mock import patch


+@pytest.mark.skip_missing_tokenizer()
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
@@ -17,7 +19,7 @@ def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,

     BATCH_SIZE = 8
     kwargs = {
-        "model_name": "decapoda-research/llama-7b-hf",
+        "model_name": "meta-llama/Llama-2-7b-hf",
         "batch_size_training": BATCH_SIZE,
         "val_batch_size": 1,
         "use_peft": False,
@@ -46,7 +48,7 @@ def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
     assert "attention_mask" in batch.keys()

     assert batch["labels"][0][268] == -100
-    assert batch["labels"][0][269] == 22291
+    assert batch["labels"][0][269] == 319

     assert batch["input_ids"][0][0] == 1
     assert batch["labels"][0][-1] == 2
6 changes: 4 additions & 2 deletions tests/test_batching.py
@@ -5,6 +5,7 @@
 from unittest.mock import patch


+@pytest.mark.skip_missing_tokenizer()
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
@@ -16,7 +17,7 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, mocker, setup_
     setup_tokenizer(tokenizer)

     kwargs = {
-        "model_name": "decapoda-research/llama-7b-hf",
+        "model_name": "meta-llama/Llama-2-7b-hf",
         "batch_size_training": 8,
         "val_batch_size": 1,
         "use_peft": False,
@@ -46,6 +47,7 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, mocker, setup_
     assert batch["attention_mask"][0].size(0) == 4096


+@pytest.mark.skip_missing_tokenizer()
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
@@ -69,7 +71,7 @@ def test_distributed_packing(dist, is_initialized, fsdp, setup, step_lr, optimiz
     os.environ['MASTER_PORT'] = '12345'

     kwargs = {
-        "model_name": "decapoda-research/llama-7b-hf",
+        "model_name": "meta-llama/Llama-2-7b-hf",
         "batch_size_training": 8,
         "val_batch_size": 1,
         "use_peft": False,
