From 92a4325beae94e466196934d5ac4b1235b423054 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 13:37:01 +0000
Subject: [PATCH 1/9] guard fused kernels with mark.forked

---
 tests/model/test_fused_kernels.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py
index 125eb6c52..ce48390bc 100644
--- a/tests/model/test_fused_kernels.py
+++ b/tests/model/test_fused_kernels.py
@@ -30,6 +30,7 @@
 )


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_load_fused_kernels():
     load()
@@ -45,6 +46,7 @@ def test_load_fused_kernels():
         raise e


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_fused_softmax():
     load()
@@ -148,6 +150,7 @@ def test_fused_softmax():
     )


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_fused_upper_triangle_mask_softmax():
     load()

From b66d141c1693d250ac8a72430be69c5e4067af13 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 13:55:54 +0000
Subject: [PATCH 2/9] remove explicit skips due to cuda issue

---
 tests/model/test_model_generation.py         |  1 -
 tests/model/test_model_train.py              |  4 ----
 tests/unit/test_format_conversion_scripts.py |  3 ---
 tests/unit/test_launcher_scripts.py          | 18 ++++++------------
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index 6dd93f355..4d64bedf7 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -64,7 +64,6 @@
 )


-@pytest.mark.skip
 @pytest.mark.parametrize("param_dict", parameters, ids=names)
 def test_train(param_dict):
     t1 = run_generate_test_class()
diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py
index 65adfcdee..d05b00650 100644
--- a/tests/model/test_model_train.py
+++ b/tests/model/test_model_train.py
@@ -48,10 +48,6 @@
 keys_to_test = PARAMS_TO_TEST.keys()


-# TODO: fix model training tests
-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 @pytest.mark.parametrize(
     "key, value",
     [(key, value) for key in keys_to_test for value in PARAMS_TO_TEST[key]],
diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py
index 6935e480a..93d0fc380 100644
--- a/tests/unit/test_format_conversion_scripts.py
+++ b/tests/unit/test_format_conversion_scripts.py
@@ -4,9 +4,6 @@
 from megatron.neox_arguments.neox_args import NeoXArgsTokenizer


-@pytest.mark.skip(
-    reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path):
     # Generate random GPT-NEOX model, check we can convert to hf format

diff --git a/tests/unit/test_launcher_scripts.py b/tests/unit/test_launcher_scripts.py
index bdc38f111..a7f96b21a 100644
--- a/tests/unit/test_launcher_scripts.py
+++ b/tests/unit/test_launcher_scripts.py
@@ -56,9 +56,6 @@ def test_preprocess_data(tokenizer_type):
     preprocess_data.main(input_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file):
     model_dir = str(tmpdir)
     sample_output_file = str(tmp_path) + ".txt"
@@ -75,9 +72,6 @@ def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file):
     generate.main(input_args=deepspeed_main_args, overwrite_values=generate_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_evaluate(monkeypatch, tmpdir, tmp_path):
     model_dir = str(tmpdir)
     sample_output_file = str(tmp_path)
@@ -94,9 +88,9 @@ def test_evaluate(monkeypatch, tmpdir, tmp_path):
     eval.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
+#@pytest.mark.skip(
+#    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
+#)
 def test_finetuning(monkeypatch, tmpdir, tmp_path):
     # Save random model, load random model, keep training
     # TODO: add mocking to check that we're not ignoring the previously loaded model
@@ -111,9 +105,9 @@ def test_finetuning(monkeypatch, tmpdir, tmp_path):
     train.main(input_args=deepspeed_main_args, overwrite_values=finetune_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
+#@pytest.mark.skip(
+#    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
+#)
 def test_train_launcher(monkeypatch):
     input_args = ["train.py", "tests/config/test_setup.yml"]
     deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args)

From 456c45d0a37ae0f57fc1c0f74fe6d65ed3e37b49 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 14:01:43 +0000
Subject: [PATCH 3/9] remove last unnecessary skip

---
 tests/model/test_model_checkpoint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 96f51683b..1c3728894 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -65,7 +65,6 @@
 )


-@pytest.mark.skip
 @pytest.mark.parametrize("param_dict", parameters, ids=names)
 def test_train(param_dict):
     import tempfile

From f7eee21d4180408fadc4ad80f5e0f5c12c0d86be Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 14:35:27 +0000
Subject: [PATCH 4/9] pass conversion test

---
 tools/ckpts/convert_neox_to_hf.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py
index 8dfe02d54..ae480dd2d 100644
--- a/tools/ckpts/convert_neox_to_hf.py
+++ b/tools/ckpts/convert_neox_to_hf.py
@@ -444,10 +444,12 @@ def reshard_and_split_qkv(

 def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential):
     """Determine whether the checkpoint uses the legacy or new MLP naming convention."""
-    print(list(loaded_tp_ranks[0]["module"].keys()))
+    for state_dict in loaded_tp_ranks:
+        print("------------------------------")
+        print(state_dict.keys())
     if any(
         [
-            ["mlp.linear1.weight" in key for key in list(state_dict["module"].keys())]
+            ["mlp.linear1.weight" in key for key in list(state_dict.keys())]
             for state_dict in loaded_tp_ranks
         ]
     ):
@@ -456,7 +458,7 @@ def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential):
             [
                 [
                     "mlp.dense_h_to_4h.weight" in key
-                    for key in list(state_dict["module"].keys())
+                    for key in list(state_dict.keys())
                 ]
                 for state_dict in loaded_tp_ranks
             ]

From f54e0e40b72f0d6d0eab0798f94469cfcd24e9c5 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 18:15:43 +0000
Subject: [PATCH 5/9] fix bugs

---
 megatron/neox_arguments/arguments.py | 5 ++++-
 tests/common.py                      | 2 +-
 tests/model/test_model_generation.py | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index fa475c057..85daeb51e 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -59,6 +59,7 @@
 OKAY = f"{GREEN}[OKAY]{END}"
 WARNING = f"{YELLOW}[WARNING]{END}"
 FAIL = f"{RED}[FAIL]{END}"
+ERROR = f"{RED}[ERROR]{END}"
 INFO = "[INFO]"

 # ZERO defaults by deespeed
@@ -875,16 +876,17 @@
     def calculate_derived(self):
         """
         Derives additional configuration values necessary for training from the current config
        """
-        # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)
         if global_num_gpus is None:
             if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE):
                 hostfile_path = self.hostfile or DLTS_HOSTFILE
+                print(hostfile_path, self.include, self.exclude)
                 resources = obtain_resource_pool(
                     hostfile_path, self.include or "", self.exclude or ""
                 )
+                print(resources)
                 if self.num_nodes is not None and self.num_nodes > 0:
                     resources = {
                         k: resources[k]
@@ -896,6 +898,7 @@ def calculate_derived(self):
             else:
                 global_num_gpus = torch.cuda.device_count()
             self.update_value("global_num_gpus", global_num_gpus)
+

         logging.info(
             self.__class__.__name__
diff --git a/tests/common.py b/tests/common.py
index c63ced0f7..e3e2b5473 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -476,7 +476,7 @@ def get_test_path(filename):
 def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     from megatron.neox_arguments import NeoXArgs
     from megatron.mpu import destroy_model_parallel
-    from megatron import initialize_megatron
+    from megatron.initialize import initialize_megatron
     from megatron.training import setup_model_and_optimizer

     destroy_model_parallel()  # mpu model parallel contains remaining global vars
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index 4d64bedf7..e4c8a87c1 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -25,6 +25,7 @@
 from tests.common import DistributedTest, model_setup, parametrize

 PARAMS_TO_TEST = {
+    "include":["localhost:0,1"],
     "pipe_parallel_size,model_parallel_size,world_size": [
         [0, 1, 1],
         [0, 1, 2],
@@ -73,7 +74,7 @@ def test_train(param_dict):
 class run_generate_test_class(DistributedTest):
     world_size = 2

-    def run_generate_test(param_dict, prompt):
+    def run_generate_test(self, param_dict, prompt):
         from megatron.text_generation_utils import generate_samples_from_prompt
         from megatron.utils import is_mp_rank_0


From 9e60eecee9cc0fac810546ee98dd1c81c453d9d8 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 15:18:33 +0000
Subject: [PATCH 6/9] remove print statements

---
 megatron/neox_arguments/arguments.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 85daeb51e..5960ca232 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -882,11 +882,9 @@ def calculate_derived(self):
         if global_num_gpus is None:
             if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE):
                 hostfile_path = self.hostfile or DLTS_HOSTFILE
-                print(hostfile_path, self.include, self.exclude)
                 resources = obtain_resource_pool(
                     hostfile_path, self.include or "", self.exclude or ""
                 )
-                print(resources)
                 if self.num_nodes is not None and self.num_nodes > 0:
                     resources = {
                         k: resources[k]

From 1745ffb50b80caab9526d9fd9b6596e1bc9789eb Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 16:47:10 +0000
Subject: [PATCH 7/9] progress?

---
 tests/common.py                      | 15 +++++++++++----
 tests/conftest.py                    |  5 +++++
 tests/model/test_model_generation.py | 17 +++++++----------
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/tests/common.py b/tests/common.py
index e3e2b5473..fc41a8db9 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -16,6 +16,8 @@
 import time
 import shutil
 import itertools
+import inspect
+import subprocess
 from pathlib import Path
 from abc import ABC, abstractmethod
 from deepspeed.accelerator import get_accelerator
@@ -48,6 +50,14 @@
 DEEPSPEED_UNIT_WORKER_TIMEOUT = 120
 DEEPSPEED_TEST_TIMEOUT = 600

+def is_rocm_pytorch():
+    """
+    Check if the current PyTorch installation is using ROCm.
+
+    Returns:
+        bool: True if PyTorch is using ROCm, False otherwise.
+    """
+    return hasattr(torch.version, 'hip') and torch.version.hip is not None

 def get_xdist_worker_id():
     xdist_worker = os.environ.get("PYTEST_XDIST_WORKER", None)
@@ -67,7 +77,6 @@ def get_master_port():


 _num_gpus = None
-
 def set_accelerator_visible():
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     xdist_worker_id = get_xdist_worker_id()
@@ -428,9 +437,7 @@ def test_2(self, val1, val2, val3, val4):
         assert int(os.environ["WORLD_SIZE"]) == 1
         assert all(val1, val2, val3, val4)
     """
-
-    def __init__(self):
-        self.is_dist_test = True
+    is_dist_test = True

     # Temporary directory that is shared among test methods in a class
     @pytest.fixture(autouse=True, scope="class")
diff --git a/tests/conftest.py b/tests/conftest.py
index 917dd8543..234590080 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -68,6 +68,11 @@ def check_environment(pytestconfig):
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # We want to use our own launching function for distributed tests
+    print("-------------------------------------------------------------------------")
+    print(type(item))
+    func_name = item.function.__name__ if hasattr(item, 'function') else None
+    print(f"Function name: {func_name}")
+    print("-------------------------------------------------------------------------")
     if getattr(item.cls, "is_dist_test", False):
         dist_test_class = item.cls()
         dist_test_class(item._request)
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index e4c8a87c1..ce7cd064f 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -64,17 +64,11 @@
     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 )

-
-@pytest.mark.parametrize("param_dict", parameters, ids=names)
-def test_train(param_dict):
-    t1 = run_generate_test_class()
-    t1.run_generate_test(param_dict, param_dict.pop("prompt"))
-
-
-class run_generate_test_class(DistributedTest):
+class TestModelGeneration(DistributedTest):
     world_size = 2

-    def run_generate_test(self, param_dict, prompt):
+    @pytest.mark.parametrize("param_dict", parameters, ids=names)
+    def test_generate(self, param_dict, tmpdir):
         from megatron.text_generation_utils import generate_samples_from_prompt
         from megatron.utils import is_mp_rank_0

@@ -89,10 +83,10 @@ def test_generate(self, param_dict, tmpdir):
         }
         param_dict.update(fixed_params)

-        # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this
         model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
         model.eval()

+        prompt = param_dict.pop("prompt")
         prompts = [prompt for _ in range(args_loaded.num_samples)]
         output = generate_samples_from_prompt(
             neox_args=args_loaded,
@@ -111,3 +105,6 @@ def test_generate(self, param_dict, tmpdir):
         for prompt, out in zip(prompts, output):
             assert prompt == out["context"]
             assert len(out["text"]) > 0
+
+        # Clean up
+        del model

From afe2c405f8b9ebc36ba434251264ad4eada002fd Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 20:35:13 +0000
Subject: [PATCH 8/9] consistent failure

---
 megatron/logging.py                     | 54 +++++++++++++++++++++++++
 tests/common.py                         |  9 ++---
 tests/conftest.py                       |  5 ---
 tests/model/test_model_checkpoint.py    | 49 ++++++++--------
 tests/model/test_model_generation.py    |  2 +-
 tests/model/test_model_instantiation.py |  2 +-
 6 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/megatron/logging.py b/megatron/logging.py
index af8a41fe5..37c96e125 100644
--- a/megatron/logging.py
+++ b/megatron/logging.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import sys
+import os

 import torch

@@ -26,6 +27,7 @@
 import math


+'''
 class Tee:
     """Duplicate output to both stdout/err and file"""

@@ -61,6 +63,58 @@ def flush(self) -> None:
                 self.file.flush()
             except OSError:
                 pass
+'''
+
+class Tee:
+    """Duplicate output to both stdout/err and file"""
+
+    def __init__(self, file, err: bool = False) -> None:
+        self.err = err
+        self.std = sys.stderr if err else sys.stdout
+
+        if isinstance(file, str):
+            try:
+                # Ensure the directory exists if file is a path
+                os.makedirs(os.path.dirname(file), exist_ok=True)
+                self.file = open(file, "w")
+            except IOError as e:
+                print(f"Warning: Could not open file {file} for writing. {str(e)}", file=self.std)
+                self.file = None
+        elif hasattr(file, 'write') and hasattr(file, 'flush'):
+            # If it's a file-like object, use it directly
+            self.file = file
+        else:
+            raise ValueError("'file' must be either a file path or a file-like object")
+
+        if not err:
+            sys.stdout = self
+        else:
+            sys.stderr = self
+
+    def __del__(self) -> None:
+        if not self.err:
+            sys.stdout = self.std
+        else:
+            sys.stderr = self.std
+
+        if self.file and hasattr(self.file, 'close'):
+            self.file.close()
+
+    def write(self, data) -> None:
+        self.std.write(data)
+        if self.file:
+            try:
+                self.file.write(data)
+            except IOError as e:
+                print(f"Warning: Could not write to file. {str(e)}", file=self.std)
+
+    def flush(self) -> None:
+        self.std.flush()
+        if self.file:
+            try:
+                self.file.flush()
+            except IOError as e:
+                print(f"Warning: Could not flush file. {str(e)}", file=self.std)


 def human_readable_flops(num) -> str:
diff --git a/tests/common.py b/tests/common.py
index fc41a8db9..4c8b6787b 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -132,8 +132,6 @@ def set_accelerator_visible():
 def count_gpus():
     global _num_gpus
     if _num_gpus is None:
-        import subprocess
-
         nvidia_smi = subprocess.check_output(["nvidia-smi", "--list-gpus"])
         _num_gpus = len(nvidia_smi.decode("utf-8").strip().split("\n"))
     return _num_gpus
@@ -146,8 +144,6 @@ def set_cuda_visibile():
     xdist_worker_id = 0
     if cuda_visible is None:
         # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead
-        import subprocess
-
         nvidia_smi = subprocess.check_output(["nvidia-smi", "--list-gpus"])
         num_gpus = len(nvidia_smi.decode("utf-8").strip().split("\n"))
         cuda_visible = ",".join(map(str, range(num_gpus)))
@@ -516,10 +512,11 @@ def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     args_loaded.build_tokenizer()

     initialize_megatron(neox_args=args_loaded)
-    model, optimizer, lr_scheduler = setup_model_and_optimizer(
+    print("YAP")
+    model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
         neox_args=args_loaded, use_cache=True
     )
-    return model, optimizer, lr_scheduler, args_loaded
+    return model, optimizer, lr_scheduler, reference_model, args_loaded
diff --git a/tests/conftest.py b/tests/conftest.py
index 234590080..917dd8543 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -68,11 +68,6 @@ def check_environment(pytestconfig):
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # We want to use our own launching function for distributed tests
-    print("-------------------------------------------------------------------------")
-    print(type(item))
-    func_name = item.function.__name__ if hasattr(item, 'function') else None
-    print(f"Function name: {func_name}")
-    print("-------------------------------------------------------------------------")
     if getattr(item.cls, "is_dist_test", False):
         dist_test_class = item.cls()
         dist_test_class(item._request)
diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 1c3728894..43bc05240 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -33,7 +33,8 @@
 import torch

 PARAMS_TO_TEST = {
-    "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2], [2, 1]],
+    "include":["localhost:0,1"],
+    "pipe_parallel_size,model_parallel_size": [[1, 2], [0, 2], [2, 1]],
     "checkpoint_validation_with_forward_pass": [True],
     "fp16,fp32_allreduce": [
         [
@@ -61,30 +62,22 @@
 }

 parameters, names = parametrize(
-    PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
+    PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=42
 )

+class TestModelCheckpoint(DistributedTest):
+    world_size = 2

-@pytest.mark.parametrize("param_dict", parameters, ids=names)
-def test_train(param_dict):
-    import tempfile
-
-    d = tempfile.mkdtemp()
-    param_dict["save"] = d
-
-    t1 = test_run_checkpoint_test_class()
-    t1.run_checkpoint_test(param_dict=param_dict)
-
-
-class test_run_checkpoint_test_class(DistributedTest):
-    def run_checkpoint_test(yaml_list=None, param_dict=None):
-
+    @pytest.mark.parametrize("param_dict", parameters, ids=names)
+    def test_checkpoint(self, param_dict, tmpdir):
         from megatron.checkpointing import load_checkpoint
         from megatron.checkpointing import save_checkpoint
+        print("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB")

-        model, optimizer, lr_scheduler, args_loaded = model_setup(
-            yaml_list, param_dict, clear_data=True
+        model, optimizer, lr_scheduler, reference_model, args_loaded = model_setup(
+            yaml_list=None, param_dict=param_dict, clear_data=True
         )
+        print("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")

         # save model checkpoint
         save_checkpoint(
@@ -101,7 +94,7 @@
             reloaded_optimizer,
             reloaded_lr_scheduler,
             args_reloaded,
-        ) = model_setup(yaml_list, param_dict, clear_data=False)
+        ) = model_setup(yaml_list=None, param_dict=param_dict, clear_data=False)
         iteration = load_checkpoint(
             neox_args=args_reloaded,
             model=reloaded_model,
@@ -110,9 +103,7 @@
         )

         # ensure same checkpoint is loaded
-        assert (
-            iteration == 42
-        ), "run_checkpoint_test() iteration loaded from checkpoint correct"
+        assert iteration == 42, "Iteration loaded from checkpoint is incorrect"

         # check all weight groups are the same
         for idx, ((n1, p1), (n2, p2)) in enumerate(
@@ -122,14 +113,8 @@
             )
         ):
             assert n1 == n2
-            params_equal = (p1 == p2).all().item()
-            assert params_equal, "run_checkpoint_test() params equal: " + str(n1)
+            params_equal = torch.all(p1 == p2).item()
+            assert params_equal, f"Parameters not equal: {n1}"

-
-if __name__ == "__main__":
-    params = list(
-        parametrize(
-            PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
-        )
-    )
-    test_train(params[0])
+        # Clean up
+        del model, reloaded_model
\ No newline at end of file
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index ce7cd064f..093c174c3 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -83,7 +83,7 @@ def test_generate(self, param_dict, tmpdir):
         }
         param_dict.update(fixed_params)

-        model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
+        model, _, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
         model.eval()

         prompt = param_dict.pop("prompt")
diff --git a/tests/model/test_model_instantiation.py b/tests/model/test_model_instantiation.py
index 81c5cae4c..8adb70148 100644
--- a/tests/model/test_model_instantiation.py
+++ b/tests/model/test_model_instantiation.py
@@ -115,7 +115,7 @@ class test_instantiate_optimizers_class(DistributedTest):
     def run_test_model_instantiation(yaml_list=None, param_dict=None):
         from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

-        model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
+        model, optimizer, lr_scheduler, reference_model, args_loaded = model_setup(yaml_list, param_dict)
         if args_loaded.pipe_parallel_size < 2:
             assert isinstance(
                 model, DeepSpeedEngine

From 5e4d9257f1b518e5a021ce4abbc63a1303e25f08 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Tue, 22 Oct 2024 19:54:59 +0000
Subject: [PATCH 9/9] wip

---
 megatron/checkpointing.py            | 4 +++-
 tests/common.py                      | 1 -
 tests/model/test_model_checkpoint.py | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index 1b6909c9f..97a79366a 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -96,7 +96,9 @@ def do_forward_pass(neox_args, model, inference=False):
         tokens, attention_mask, position_ids = get_batch(
             neox_args, context_tokens_tensor[:, : neox_args.seq_length]
         )
-        logits = model((tokens, position_ids, attention_mask))
+        output = model((tokens, position_ids, attention_mask))
+        logits = output[0] if isinstance(output, tuple) else output
+

     # reset to train mode, if model was in training before
     if model_was_in_train:
diff --git a/tests/common.py b/tests/common.py
index 4c8b6787b..893476a42 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -512,7 +512,6 @@ def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     args_loaded.build_tokenizer()

     initialize_megatron(neox_args=args_loaded)
-    print("YAP")
     model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
         neox_args=args_loaded, use_cache=True
     )
diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 43bc05240..7bd108d61 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -93,6 +93,7 @@ def test_checkpoint(self, param_dict, tmpdir):
             reloaded_model,
             reloaded_optimizer,
             reloaded_lr_scheduler,
+            reloaded_reference_model,
             args_reloaded,
         ) = model_setup(yaml_list=None, param_dict=param_dict, clear_data=False)
         iteration = load_checkpoint(
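
The series as a whole leans on two pytest isolation techniques, sketched together below. This is a minimal illustrative sketch, not code from the patches: it assumes pytest plus the pytest-forked plugin (which provides mark.forked) are installed, and the test and class names are invented for the example; only the pytest_runtest_call hook body mirrors tests/conftest.py.

# Minimal sketch of the isolation patterns used above. Assumes pytest and
# the pytest-forked plugin are installed; names here are illustrative only.
import pytest


# PATCH 1/9 pattern: run the test in a forked child process so that CUDA
# initialization, or a SystemExit raised while building fused kernels,
# dies with the child instead of poisoning the main pytest process.
@pytest.mark.forked
def test_cuda_isolation_example():
    import torch

    if torch.cuda.is_available():
        torch.ones(1).cuda()  # the CUDA context lives and dies in this fork


# PATCH 7/9 pattern: a plain class attribute replaces __init__, because
# pytest will not collect test classes that define a constructor.
class ExampleDistributedTest:
    is_dist_test = True


# Hook body as in tests/conftest.py: for plain function items, item.cls is
# None, so getattr() safely falls through to the default test runner.
@pytest.hookimpl(tryfirst=True)
def pytest_runtest_call(item):
    if getattr(item.cls, "is_dist_test", False):
        dist_test_class = item.cls()
        dist_test_class(item._request)

Forking each CUDA-touching test trades speed for robustness, since every forked test pays CUDA initialization again; that is presumably why the series applies mark.forked only to the fused-kernel tests rather than to the whole suite.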