From 92a4325beae94e466196934d5ac4b1235b423054 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 13:37:01 +0000
Subject: [PATCH 1/9] guard fused kernels with mark.forked

---
 tests/model/test_fused_kernels.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py
index 125eb6c52..ce48390bc 100644
--- a/tests/model/test_fused_kernels.py
+++ b/tests/model/test_fused_kernels.py
@@ -30,6 +30,7 @@
 )


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_load_fused_kernels():
     load()
@@ -45,6 +46,7 @@ def test_load_fused_kernels():
         raise e


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_fused_softmax():
     load()
@@ -148,6 +150,7 @@ def test_fused_softmax():
     )


+@pytest.mark.forked
 @pytest.mark.xfail(reason="SystemExit: None")
 def test_fused_upper_triangle_mask_softmax():
     load()

From b66d141c1693d250ac8a72430be69c5e4067af13 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 13:55:54 +0000
Subject: [PATCH 2/9] remove explicit skips due to cuda issue

---
 tests/model/test_model_generation.py         |  1 -
 tests/model/test_model_train.py              |  4 ----
 tests/unit/test_format_conversion_scripts.py |  3 ---
 tests/unit/test_launcher_scripts.py          | 18 ++++++------------
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index 6dd93f355..4d64bedf7 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -64,7 +64,6 @@
 )


-@pytest.mark.skip
 @pytest.mark.parametrize("param_dict", parameters, ids=names)
 def test_train(param_dict):
     t1 = run_generate_test_class()
diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py
index 65adfcdee..d05b00650 100644
--- a/tests/model/test_model_train.py
+++ b/tests/model/test_model_train.py
@@ -48,10 +48,6 @@
 keys_to_test = PARAMS_TO_TEST.keys()


-# TODO: fix model training tests
-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 @pytest.mark.parametrize(
     "key, value",
     [(key, value) for key in keys_to_test for value in PARAMS_TO_TEST[key]],
diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py
index 6935e480a..93d0fc380 100644
--- a/tests/unit/test_format_conversion_scripts.py
+++ b/tests/unit/test_format_conversion_scripts.py
@@ -4,9 +4,6 @@
 from megatron.neox_arguments.neox_args import NeoXArgsTokenizer


-@pytest.mark.skip(
-    reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path):
     # Generate random GPT-NEOX model, check we can convert to hf format

diff --git a/tests/unit/test_launcher_scripts.py b/tests/unit/test_launcher_scripts.py
index bdc38f111..a7f96b21a 100644
--- a/tests/unit/test_launcher_scripts.py
+++ b/tests/unit/test_launcher_scripts.py
@@ -56,9 +56,6 @@ def test_preprocess_data(tokenizer_type):
     preprocess_data.main(input_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file):
     model_dir = str(tmpdir)
     sample_output_file = str(tmp_path) + ".txt"
@@ -75,9 +72,6 @@ def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file):
     generate.main(input_args=deepspeed_main_args, overwrite_values=generate_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
 def test_evaluate(monkeypatch, tmpdir, tmp_path):
     model_dir = str(tmpdir)
     sample_output_file = str(tmp_path)
@@ -94,9 +88,9 @@ def test_evaluate(monkeypatch, tmpdir, tmp_path):
     eval.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
+#@pytest.mark.skip(
+#    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
+#)
 def test_finetuning(monkeypatch, tmpdir, tmp_path):
     # Save random model, load random model, keep training
     # TODO: add mocking to check that we're not ignoring the previously loaded model
@@ -111,9 +105,9 @@ def test_finetuning(monkeypatch, tmpdir, tmp_path):
     train.main(input_args=deepspeed_main_args, overwrite_values=finetune_args)


-@pytest.mark.skip(
-    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
-)
+#@pytest.mark.skip(
+#    reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
+#)
 def test_train_launcher(monkeypatch):
     input_args = ["train.py", "tests/config/test_setup.yml"]
     deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args)

From 456c45d0a37ae0f57fc1c0f74fe6d65ed3e37b49 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 14:01:43 +0000
Subject: [PATCH 3/9] remove last unnecessary skip

---
 tests/model/test_model_checkpoint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 96f51683b..1c3728894 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -65,7 +65,6 @@
 )


-@pytest.mark.skip
 @pytest.mark.parametrize("param_dict", parameters, ids=names)
 def test_train(param_dict):
     import tempfile

From f7eee21d4180408fadc4ad80f5e0f5c12c0d86be Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 14:35:27 +0000
Subject: [PATCH 4/9] pass conversion test

---
 tools/ckpts/convert_neox_to_hf.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py
index 8dfe02d54..ae480dd2d 100644
--- a/tools/ckpts/convert_neox_to_hf.py
+++ b/tools/ckpts/convert_neox_to_hf.py
@@ -444,10 +444,12 @@ def reshard_and_split_qkv(

 def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential):
     """Determine whether the checkpoint uses the legacy or new MLP naming convention."""
-    print(list(loaded_tp_ranks[0]["module"].keys()))
+    for state_dict in loaded_tp_ranks:
+        print("------------------------------")
+        print(state_dict.keys())
     if any(
         [
-            ["mlp.linear1.weight" in key for key in list(state_dict["module"].keys())]
+            ["mlp.linear1.weight" in key for key in list(state_dict.keys())]
             for state_dict in loaded_tp_ranks
         ]
     ):
@@ -456,7 +458,7 @@ def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential):
             [
                 [
                     "mlp.dense_h_to_4h.weight" in key
-                    for key in list(state_dict["module"].keys())
+                    for key in list(state_dict.keys())
                 ]
                 for state_dict in loaded_tp_ranks
             ]

From f54e0e40b72f0d6d0eab0798f94469cfcd24e9c5 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Mon, 14 Oct 2024 18:15:43 +0000
Subject: [PATCH 5/9] fix bugs

---
 megatron/neox_arguments/arguments.py | 5 ++++-
 tests/common.py                      | 2 +-
 tests/model/test_model_generation.py | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index fa475c057..85daeb51e 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -59,6 +59,7 @@
 OKAY = f"{GREEN}[OKAY]{END}"
 WARNING = f"{YELLOW}[WARNING]{END}"
 FAIL = f"{RED}[FAIL]{END}"
+ERROR = f"{RED}[ERROR]{END}"
 INFO = "[INFO]"

 # ZERO defaults by deespeed
@@ -875,16 +876,17 @@
     def calculate_derived(self):
         """
         Derives additional configuration values necessary for training from the current config
        """
-        # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)
         if global_num_gpus is None:
             if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE):
                 hostfile_path = self.hostfile or DLTS_HOSTFILE
+                print(hostfile_path, self.include, self.exclude)
                 resources = obtain_resource_pool(
                     hostfile_path, self.include or "", self.exclude or ""
                 )
+                print(resources)
                 if self.num_nodes is not None and self.num_nodes > 0:
                     resources = {
                         k: resources[k]
@@ -896,6 +898,7 @@ def calculate_derived(self):
             else:
                 global_num_gpus = torch.cuda.device_count()
             self.update_value("global_num_gpus", global_num_gpus)
+

         logging.info(
             self.__class__.__name__
diff --git a/tests/common.py b/tests/common.py
index c63ced0f7..e3e2b5473 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -476,7 +476,7 @@ def get_test_path(filename):
 def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     from megatron.neox_arguments import NeoXArgs
     from megatron.mpu import destroy_model_parallel
-    from megatron import initialize_megatron
+    from megatron.initialize import initialize_megatron
     from megatron.training import setup_model_and_optimizer

     destroy_model_parallel()  # mpu model parallel contains remaining global vars
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index 4d64bedf7..e4c8a87c1 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -25,6 +25,7 @@
 from tests.common import DistributedTest, model_setup, parametrize

 PARAMS_TO_TEST = {
+    "include":["localhost:0,1"],
     "pipe_parallel_size,model_parallel_size,world_size": [
         [0, 1, 1],
         [0, 1, 2],
@@ -73,7 +74,7 @@ def test_train(param_dict):
 class run_generate_test_class(DistributedTest):
     world_size = 2

-    def run_generate_test(param_dict, prompt):
+    def run_generate_test(self, param_dict, prompt):
         from megatron.text_generation_utils import generate_samples_from_prompt
         from megatron.utils import is_mp_rank_0


From 9e60eecee9cc0fac810546ee98dd1c81c453d9d8 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 15:18:33 +0000
Subject: [PATCH 6/9] remove print statements

---
 megatron/neox_arguments/arguments.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 85daeb51e..5960ca232 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -882,11 +882,9 @@ def calculate_derived(self):
         if global_num_gpus is None:
             if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE):
                 hostfile_path = self.hostfile or DLTS_HOSTFILE
-                print(hostfile_path, self.include, self.exclude)
                 resources = obtain_resource_pool(
                     hostfile_path, self.include or "", self.exclude or ""
                 )
-                print(resources)
                 if self.num_nodes is not None and self.num_nodes > 0:
                     resources = {
                         k: resources[k]

From 1745ffb50b80caab9526d9fd9b6596e1bc9789eb Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 16:47:10 +0000
Subject: [PATCH 7/9] progress?

---
 tests/common.py                      | 15 +++++++++++----
 tests/conftest.py                    |  5 +++++
 tests/model/test_model_generation.py | 17 +++++++----------
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/tests/common.py b/tests/common.py
index e3e2b5473..fc41a8db9 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -16,6 +16,8 @@
 import time
 import shutil
 import itertools
+import inspect
+import subprocess
 from pathlib import Path
 from abc import ABC, abstractmethod
 from deepspeed.accelerator import get_accelerator
@@ -48,6 +50,14 @@
 DEEPSPEED_UNIT_WORKER_TIMEOUT = 120
 DEEPSPEED_TEST_TIMEOUT = 600

+def is_rocm_pytorch():
+    """
+    Check if the current PyTorch installation is using ROCm.
+
+    Returns:
+        bool: True if PyTorch is using ROCm, False otherwise.
+    """
+    return hasattr(torch.version, 'hip') and torch.version.hip is not None

 def get_xdist_worker_id():
     xdist_worker = os.environ.get("PYTEST_XDIST_WORKER", None)
@@ -67,7 +77,6 @@ def get_master_port():


 _num_gpus = None
-
 def set_accelerator_visible():
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
     xdist_worker_id = get_xdist_worker_id()
@@ -428,9 +437,7 @@ def test_2(self, val1, val2, val3, val4):
         assert int(os.environ["WORLD_SIZE"]) == 1
         assert all(val1, val2, val3, val4)
     """
-
-    def __init__(self):
-        self.is_dist_test = True
+    is_dist_test = True

     # Temporary directory that is shared among test methods in a class
     @pytest.fixture(autouse=True, scope="class")
diff --git a/tests/conftest.py b/tests/conftest.py
index 917dd8543..234590080 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -68,6 +68,11 @@ def check_environment(pytestconfig):
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # We want to use our own launching function for distributed tests
+    print("-------------------------------------------------------------------------")
+    print(type(item))
+    func_name = item.function.__name__ if hasattr(item, 'function') else None
+    print(f"Function name: {func_name}")
+    print("-------------------------------------------------------------------------")
     if getattr(item.cls, "is_dist_test", False):
         dist_test_class = item.cls()
         dist_test_class(item._request)
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index e4c8a87c1..ce7cd064f 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -64,17 +64,11 @@
     PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
 )

-
-@pytest.mark.parametrize("param_dict", parameters, ids=names)
-def test_train(param_dict):
-    t1 = run_generate_test_class()
-    t1.run_generate_test(param_dict, param_dict.pop("prompt"))
-
-
-class run_generate_test_class(DistributedTest):
+class TestModelGeneration(DistributedTest):
     world_size = 2

-    def run_generate_test(self, param_dict, prompt):
+    @pytest.mark.parametrize("param_dict", parameters, ids=names)
+    def test_generate(self, param_dict, tmpdir):
         from megatron.text_generation_utils import generate_samples_from_prompt
         from megatron.utils import is_mp_rank_0

@@ -89,10 +83,10 @@ def test_generate(self, param_dict, tmpdir):
         }
         param_dict.update(fixed_params)

-        # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this
         model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
         model.eval()

+        prompt = param_dict.pop("prompt")
         prompts = [prompt for _ in range(args_loaded.num_samples)]
         output = generate_samples_from_prompt(
             neox_args=args_loaded,
@@ -111,3 +105,6 @@ def test_generate(self, param_dict, tmpdir):
         for prompt, out in zip(prompts, output):
             assert prompt == out["context"]
             assert len(out["text"]) > 0
+
+        # Clean up
+        del model

From afe2c405f8b9ebc36ba434251264ad4eada002fd Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Thu, 17 Oct 2024 20:35:13 +0000
Subject: [PATCH 8/9] consistent failure

---
 megatron/logging.py                     | 54 +++++++++++++++++++++++++
 tests/common.py                         |  9 ++---
 tests/conftest.py                       |  5 ---
 tests/model/test_model_checkpoint.py    | 49 ++++++++--------
 tests/model/test_model_generation.py    |  2 +-
 tests/model/test_model_instantiation.py |  2 +-
 6 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/megatron/logging.py b/megatron/logging.py
index af8a41fe5..37c96e125 100644
--- a/megatron/logging.py
+++ b/megatron/logging.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import sys
+import os

 import torch

@@ -26,6 +27,7 @@
 import math


+'''
 class Tee:
     """Duplicate output to both stdout/err and file"""

@@ -61,6 +63,58 @@ def flush(self) -> None:
                 self.file.flush()
             except OSError:
                 pass
+'''
+
+class Tee:
+    """Duplicate output to both stdout/err and file"""
+
+    def __init__(self, file, err: bool = False) -> None:
+        self.err = err
+        self.std = sys.stderr if err else sys.stdout
+
+        if isinstance(file, str):
+            try:
+                # Ensure the directory exists if file is a path
+                os.makedirs(os.path.dirname(file), exist_ok=True)
+                self.file = open(file, "w")
+            except IOError as e:
+                print(f"Warning: Could not open file {file} for writing. {str(e)}", file=self.std)
+                self.file = None
+        elif hasattr(file, 'write') and hasattr(file, 'flush'):
+            # If it's a file-like object, use it directly
+            self.file = file
+        else:
+            raise ValueError("'file' must be either a file path or a file-like object")
+
+        if not err:
+            sys.stdout = self
+        else:
+            sys.stderr = self
+
+    def __del__(self) -> None:
+        if not self.err:
+            sys.stdout = self.std
+        else:
+            sys.stderr = self.std
+
+        if self.file and hasattr(self.file, 'close'):
+            self.file.close()
+
+    def write(self, data) -> None:
+        self.std.write(data)
+        if self.file:
+            try:
+                self.file.write(data)
+            except IOError as e:
+                print(f"Warning: Could not write to file. {str(e)}", file=self.std)
+
+    def flush(self) -> None:
+        self.std.flush()
+        if self.file:
+            try:
+                self.file.flush()
+            except IOError as e:
+                print(f"Warning: Could not flush file. {str(e)}", file=self.std)


 def human_readable_flops(num) -> str:
diff --git a/tests/common.py b/tests/common.py
index fc41a8db9..4c8b6787b 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -132,8 +132,6 @@ def set_accelerator_visible():
 def count_gpus():
     global _num_gpus
     if _num_gpus is None:
-        import subprocess
-
         nvidia_smi = subprocess.check_output(["nvidia-smi", "--list-gpus"])
         _num_gpus = len(nvidia_smi.decode("utf-8").strip().split("\n"))
     return _num_gpus
@@ -146,8 +144,6 @@ def set_cuda_visibile():
     xdist_worker_id = 0
     if cuda_visible is None:
         # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead
-        import subprocess
-
         nvidia_smi = subprocess.check_output(["nvidia-smi", "--list-gpus"])
         num_gpus = len(nvidia_smi.decode("utf-8").strip().split("\n"))
         cuda_visible = ",".join(map(str, range(num_gpus)))
@@ -516,10 +512,11 @@ def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     args_loaded.build_tokenizer()

     initialize_megatron(neox_args=args_loaded)
-    model, optimizer, lr_scheduler = setup_model_and_optimizer(
+    print("YAP")
+    model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
         neox_args=args_loaded, use_cache=True
     )
-    return model, optimizer, lr_scheduler, args_loaded
+    return model, optimizer, lr_scheduler, reference_model, args_loaded
diff --git a/tests/conftest.py b/tests/conftest.py
index 234590080..917dd8543 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -68,11 +68,6 @@ def check_environment(pytestconfig):
 @pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # We want to use our own launching function for distributed tests
-    print("-------------------------------------------------------------------------")
-    print(type(item))
-    func_name = item.function.__name__ if hasattr(item, 'function') else None
-    print(f"Function name: {func_name}")
-    print("-------------------------------------------------------------------------")
     if getattr(item.cls, "is_dist_test", False):
         dist_test_class = item.cls()
         dist_test_class(item._request)
diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 1c3728894..43bc05240 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -33,7 +33,8 @@
 import torch

 PARAMS_TO_TEST = {
-    "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2], [2, 1]],
+    "include":["localhost:0,1"],
+    "pipe_parallel_size,model_parallel_size": [[1, 2], [0, 2], [2, 1]],
     "checkpoint_validation_with_forward_pass": [True],
     "fp16,fp32_allreduce": [
         [
@@ -61,30 +62,22 @@
 }

 parameters, names = parametrize(
-    PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
+    PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=42
 )

+class TestModelCheckpoint(DistributedTest):
+    world_size = 2

-@pytest.mark.parametrize("param_dict", parameters, ids=names)
-def test_train(param_dict):
-    import tempfile
-
-    d = tempfile.mkdtemp()
-    param_dict["save"] = d
-
-    t1 = test_run_checkpoint_test_class()
-    t1.run_checkpoint_test(param_dict=param_dict)
-
-
-class test_run_checkpoint_test_class(DistributedTest):
-    def run_checkpoint_test(yaml_list=None, param_dict=None):
-
+    @pytest.mark.parametrize("param_dict", parameters, ids=names)
+    def test_checkpoint(self, param_dict, tmpdir):
         from megatron.checkpointing import load_checkpoint
         from megatron.checkpointing import save_checkpoint
+        print("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB")

-        model, optimizer, lr_scheduler, args_loaded = model_setup(
-            yaml_list, param_dict, clear_data=True
+        model, optimizer, lr_scheduler, reference_model, args_loaded = model_setup(
+            yaml_list=None, param_dict=param_dict, clear_data=True
         )
+        print("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")

         # save model checkpoint
         save_checkpoint(
@@ -101,7 +94,7 @@
             reloaded_optimizer,
             reloaded_lr_scheduler,
             args_reloaded,
-        ) = model_setup(yaml_list, param_dict, clear_data=False)
+        ) = model_setup(yaml_list=None, param_dict=param_dict, clear_data=False)
         iteration = load_checkpoint(
             neox_args=args_reloaded,
             model=reloaded_model,
@@ -110,9 +103,7 @@
         )

         # ensure same checkpoint is loaded
-        assert (
-            iteration == 42
-        ), "run_checkpoint_test() iteration loaded from checkpoint correct"
+        assert iteration == 42, "Iteration loaded from checkpoint is incorrect"

         # check all weight groups are the same
         for idx, ((n1, p1), (n2, p2)) in enumerate(
@@ -122,14 +113,8 @@
             )
         ):
             assert n1 == n2
-            params_equal = (p1 == p2).all().item()
-            assert params_equal, "run_checkpoint_test() params equal: " + str(n1)
+            params_equal = torch.all(p1 == p2).item()
+            assert params_equal, f"Parameters not equal: {n1}"

-
-if __name__ == "__main__":
-    params = list(
-        parametrize(
-            PARAMS_TO_TEST, max_tests=int(os.getenv("MAX_TESTCASES", 50)), seed=None
-        )
-    )
-    test_train(params[0])
+        # Clean up
+        del model, reloaded_model
\ No newline at end of file
diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py
index ce7cd064f..093c174c3 100644
--- a/tests/model/test_model_generation.py
+++ b/tests/model/test_model_generation.py
@@ -83,7 +83,7 @@ def test_generate(self, param_dict, tmpdir):
         }
         param_dict.update(fixed_params)

-        model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
+        model, _, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
         model.eval()

         prompt = param_dict.pop("prompt")
diff --git a/tests/model/test_model_instantiation.py b/tests/model/test_model_instantiation.py
index 81c5cae4c..8adb70148 100644
--- a/tests/model/test_model_instantiation.py
+++ b/tests/model/test_model_instantiation.py
@@ -115,7 +115,7 @@ class test_instantiate_optimizers_class(DistributedTest):
     def run_test_model_instantiation(yaml_list=None, param_dict=None):
         from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

-        model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
+        model, optimizer, lr_scheduler, reference_model, args_loaded = model_setup(yaml_list, param_dict)
         if args_loaded.pipe_parallel_size < 2:
             assert isinstance(
                 model, DeepSpeedEngine

From 5e4d9257f1b518e5a021ce4abbc63a1303e25f08 Mon Sep 17 00:00:00 2001
From: AI_WAIFU
Date: Tue, 22 Oct 2024 19:54:59 +0000
Subject: [PATCH 9/9] wip

---
 megatron/checkpointing.py            | 4 +++-
 tests/common.py                      | 1 -
 tests/model/test_model_checkpoint.py | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index 1b6909c9f..97a79366a 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -96,7 +96,9 @@ def do_forward_pass(neox_args, model, inference=False):
         tokens, attention_mask, position_ids = get_batch(
             neox_args, context_tokens_tensor[:, : neox_args.seq_length]
         )
-        logits = model((tokens, position_ids, attention_mask))
+        output = model((tokens, position_ids, attention_mask))
+        logits = output[0] if isinstance(output, tuple) else output
+

     # reset to train mode, if model was in training before
     if model_was_in_train:
diff --git a/tests/common.py b/tests/common.py
index 4c8b6787b..893476a42 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -512,7 +512,6 @@ def model_setup(yaml_list=None, param_dict=None, clear_data=True):
     args_loaded.build_tokenizer()

     initialize_megatron(neox_args=args_loaded)
-    print("YAP")
     model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
         neox_args=args_loaded, use_cache=True
     )
diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
index 43bc05240..7bd108d61 100644
--- a/tests/model/test_model_checkpoint.py
+++ b/tests/model/test_model_checkpoint.py
@@ -93,6 +93,7 @@ def test_checkpoint(self, param_dict, tmpdir):
             reloaded_model,
             reloaded_optimizer,
             reloaded_lr_scheduler,
+            reloaded_reference_model,
             args_reloaded,
         ) = model_setup(yaml_list=None, param_dict=param_dict, clear_data=False)
         iteration = load_checkpoint(
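
The series as a whole leans on two pytest isolation techniques, sketched together below. This is a minimal illustrative sketch, not code from the patches: it assumes pytest plus the pytest-forked plugin (which provides mark.forked) are installed, and the test and class names are invented for the example; only the pytest_runtest_call hook body mirrors tests/conftest.py.

# Minimal sketch of the isolation patterns used above. Assumes pytest and
# the pytest-forked plugin are installed; names here are illustrative only.
import pytest


# PATCH 1/9 pattern: run the test in a forked child process so that CUDA
# initialization, or a SystemExit raised while building fused kernels,
# dies with the child instead of poisoning the main pytest process.
@pytest.mark.forked
def test_cuda_isolation_example():
    import torch

    if torch.cuda.is_available():
        torch.ones(1).cuda()  # the CUDA context lives and dies in this fork


# PATCH 7/9 pattern: a plain class attribute replaces __init__, because
# pytest will not collect test classes that define a constructor.
class ExampleDistributedTest:
    is_dist_test = True


# Hook body as in tests/conftest.py: for plain function items, item.cls is
# None, so getattr() safely falls through to the default test runner.
@pytest.hookimpl(tryfirst=True)
def pytest_runtest_call(item):
    if getattr(item.cls, "is_dist_test", False):
        dist_test_class = item.cls()
        dist_test_class(item._request)

Forking each CUDA-touching test trades speed for robustness, since every forked test pays CUDA initialization again; that is presumably why the series applies mark.forked only to the fused-kernel tests rather than to the whole suite.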