From 84fe6d9b36a00a21b06cacf9994cd63e76b3cd7b Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Fri, 31 Jan 2025 10:19:01 -0500
Subject: [PATCH 01/16] feat: nvidia dmc custom loader

---
 dyana/loaders/megatron/.gitignore       |   3 +
 dyana/loaders/megatron/Dockerfile       |  28 +++++
 dyana/loaders/megatron/main.py          | 153 ++++++++++++++++++++++++
 dyana/loaders/megatron/requirements.txt |  28 +++++
 dyana/loaders/megatron/settings.yml     |  29 +++++
 5 files changed, 241 insertions(+)
 create mode 100644 dyana/loaders/megatron/.gitignore
 create mode 100644 dyana/loaders/megatron/Dockerfile
 create mode 100644 dyana/loaders/megatron/main.py
 create mode 100644 dyana/loaders/megatron/requirements.txt
 create mode 100644 dyana/loaders/megatron/settings.yml

diff --git a/dyana/loaders/megatron/.gitignore b/dyana/loaders/megatron/.gitignore
new file mode 100644
index 0000000..3d0dd7e
--- /dev/null
+++ b/dyana/loaders/megatron/.gitignore
@@ -0,0 +1,3 @@
+dyana.py
+dyana-requirements.txt
+dyana-requirements-gpu.txt
\ No newline at end of file
diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
new file mode 100644
index 0000000..a487cff
--- /dev/null
+++ b/dyana/loaders/megatron/Dockerfile
@@ -0,0 +1,28 @@
+FROM nvcr.io/nvidia/pytorch:24.04-py3
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone Megatron-LM dmc branch and install
+RUN git clone -b dmc https://github.com/NVIDIA/Megatron-LM.git && \
+    cd Megatron-LM && \
+    pip install -e .
+
+# Copy loader files
+COPY . .
+
+# Install requirements
+RUN pip install --no-cache-dir -r dyana-requirements-gpu.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Environment setup
+ENV CUDA_DEVICE_MAX_CONNECTIONS=1
+ENV PYTHONUNBUFFERED=1
+ENV MEGATRON_DEBUG=1
+
+ENTRYPOINT ["python3", "-u", "main.py"]
\ No newline at end of file
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
new file mode 100644
index 0000000..c98e289
--- /dev/null
+++ b/dyana/loaders/megatron/main.py
@@ -0,0 +1,153 @@
+import os
+import sys
+import torch
+from pathlib import Path
+from dyana import Profiler
+
+from megatron.core import parallel_state
+from megatron.core.models.gpt import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.training import get_args, get_model
+from megatron.training.arguments import parse_args, core_transformer_config_from_args
+from megatron.training.initialize import initialize_megatron
+from megatron.training.checkpointing import load_checkpoint
+from megatron.contrib.dmc import add_dmc_layer
+
+
+def setup_megatron_args(model_size: str, model_path: str, tokenizer_path: str):
+    """Setup Megatron arguments"""
+    print("Debug: Starting argument setup")
+    sys.argv = [sys.argv[0]]
+
+    args = [
+        "--tensor-model-parallel-size",
+        "1",
+        "--pipeline-model-parallel-size",
+        "1",
+        "--load",
+        model_path,
+        "--tokenizer-model",
+        tokenizer_path,
+        "--tokenizer-type",
+        "Llama2Tokenizer",
+        "--bf16",
+        "--seq-length",
+        "4096",
+        "--max-position-embeddings",
+        "4096",
+        "--num-layers",
+        "32" if model_size == "7B" else "40",
+        "--hidden-size",
+        "4096" if model_size == "7B" else "5120",
+        "--num-attention-heads",
+        "32" if model_size == "7B" else "40",
+        "--micro-batch-size",
+        "1",
+        "--global-batch-size",
+        "1",
+        "--no-masked-softmax-fusion",
+        "--no-load-optim",
+        "--no-load-rng",
+        "--skip-train",
+        "--fp16",
+        "--use-cpu-initialization",  # avoid CUDA deadlocks
+        "--tokenizer-type",
+        "Llama2Tokenizer",
+    ]
+
+    print("Debug: Setting sys.argv")
+    sys.argv.extend(args)
+
+    print("Debug: Parsing args")
+    args = parse_args()
+
+    print("Debug: Initializing Megatron")
+    initialize_megatron(args_defaults={"no_load_optim": True, "no_load_rng": True})
+
+    return get_args()
+
+
+def model_provider(pre_process=True, post_process=True):
+    """Model provider for Megatron to load the model."""
+    print("Debug: Setting up model provider")
+    args = get_args()
+    config = core_transformer_config_from_args(args)
+
+    print("Debug: Creating model")
+    model = GPTModel(
+        config=config,
+        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
+        vocab_size=args.padded_vocab_size,
+        max_sequence_length=args.max_position_embeddings,
+        pre_process=pre_process,
+        post_process=post_process,
+    )
+
+    return model
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--tokenizer", required=True)
+    parser.add_argument("--size", choices=["7B", "13B"], required=True)
+    parser.add_argument("--input", default="This is an example prompt.")
+    args = parser.parse_args()
+
+    profiler = Profiler(gpu=True)
+
+    try:
+        # verify files
+        model_path = Path(args.model)
+        tokenizer_path = Path(args.tokenizer)
+        if not model_path.exists():
+            raise FileNotFoundError(f"Model not found at {model_path}")
+        if not tokenizer_path.exists():
+            raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+
+        print("Debug: Starting initialization")
+        profiler.on_stage("initializing")
+
+        print("Debug: Setting up args")
+        args = setup_megatron_args(args.size, str(model_path), str(tokenizer_path))
+
+        print("Debug: Initializing model parallel")
+        torch.cuda.empty_cache()
+        parallel_state.set_tensor_model_parallel_world_size(1)
+        parallel_state.set_tensor_model_parallel_rank(0)
+
+        print("Debug: Creating model")
+        model = get_model(model_provider, wrap_with_ddp=False)
+
+        print("Debug: Loading checkpoint")
+        _ = load_checkpoint(model[0], None, None)
+        model = model[0].cuda()
+        model.eval()
+
+        print("Loading tokenizer...")
+        from transformers import LlamaTokenizer
+
+        tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path))
+
+        print("Starting inference...")
+        input_ids = tokenizer(args.input, return_tensors="pt").to("cuda")
+
+        with torch.no_grad():
+            output = model.generate(input_ids=input_ids["input_ids"], max_new_tokens=100, use_cache=True)
+            text = tokenizer.decode(output[0], skip_special_tokens=True)
+            profiler.track("output", text)
+            print(f"Generated text: {text}")
+
+        profiler.on_stage("complete")
+
+    except Exception as e:
+        print(f"Debug: Error occurred: {str(e)}")
+        print(f"Debug: Error type: {type(e)}")
+        import traceback
+
+        print(f"Debug: Traceback: {traceback.format_exc()}")
+        profiler.track_error("model", str(e))
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt
new file mode 100644
index 0000000..31b750a
--- /dev/null
+++ b/dyana/loaders/megatron/requirements.txt
@@ -0,0 +1,28 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+# Core dependencies
+torch>=2.1.0
+transformers>=4.31.0
+accelerate>=0.21.0
+psutil>=5.6.7
+
+# Megatron and model dependencies
+ninja
+sentencepiece==0.2.0
+tokenizers>=0.13.3
+transformer-engine>=1.3
+einops>=0.6.1
+evaluate
+scikit-learn
+flash-attn==2.6.1
+hydra_colorlog==1.2.0
+hydra-core==1.3.2
+nltk
+datasets
+
+# Dyana dependencies - using the base requirements
+rich>=10.0.0
+pydantic>=2.0.0
+pydantic-yaml>=1.0.0
+docker>=6.0.0
+psutil>=5.6.7
+nvidia-ml-py>=12.0.0
\ No newline at end of file
diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
new file mode 100644
index 0000000..2aa2b75
--- /dev/null
+++ b/dyana/loaders/megatron/settings.yml
@@ -0,0 +1,29 @@
+description: Loads and profiles Megatron-LM DMC models for efficient inference
+
+build_args:
+  extra-requirements: EXTRA_REQUIREMENTS
+
+args:
+  - name: model
+    description: Path to Megatron model checkpoint
+    required: true
+    volume: true
+
+  - name: tokenizer
+    description: Path to Llama 2 tokenizer model
+    required: true
+    volume: true
+
+  - name: size
+    description: Model size (7B or 13B)
+    required: true
+    choices: ["7B", "13B"]
+
+  - name: input
+    description: Input text for inference
+    default: "This is an example prompt."
+    required: false
+
+examples:
+  - description: "Load a Megatron-DMC model with tokenizer:"
+    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B

From 0d41168172681a6e54baa175457fc31825a0c039 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 3 Feb 2025 21:28:18 -0500
Subject: [PATCH 02/16] fix: try except err handling in main script

---
 dyana/loaders/megatron/Dockerfile       |  72 +++++++++-
 dyana/loaders/megatron/main.py          | 169 ++++++++++++++++++++++++
 dyana/loaders/megatron/requirements.txt |  22 ++-
 dyana/loaders/megatron/settings.yml     |   4 +
 4 files changed, 265 insertions(+), 2 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index a487cff..46a1964 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -6,6 +6,7 @@ WORKDIR /app
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     git \
+<<<<<<< Updated upstream
     && rm -rf /var/lib/apt/lists/*
 
 # Clone Megatron-LM dmc branch and install
@@ -25,4 +26,73 @@ ENV CUDA_DEVICE_MAX_CONNECTIONS=1
 ENV PYTHONUNBUFFERED=1
 ENV MEGATRON_DEBUG=1
 
-ENTRYPOINT ["python3", "-u", "main.py"]
\ No newline at end of file
+ENTRYPOINT ["python3", "-u", "main.py"]
+=======
+    ca-certificates \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Configure environment
+ENV CUDA_HOME=/usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+ENV CUDA_LAUNCH_BLOCKING=1
+ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
+ENV CUDA_MODULE_LOADING=LAZY
+ENV TORCH_USE_CUDA_DSA=1
+ENV CUDA_DEVICE_MAX_CONNECTIONS=1
+ENV NCCL_ASYNC_ERROR_HANDLING=1
+ENV OMP_NUM_THREADS=1
+ENV NVTE_FRAMEWORK=pytorch
+ENV MAX_JOBS=4
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TORCH_CUDNN_V8_API_ENABLED=1
+ENV TORCH_ALLOW_TF32=1
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
+ENV PYTORCH_JIT=0
+ENV TORCH_COMPILE_DEBUG=1
+ENV TORCH_INDUCTOR_VAR_NAMES=1
+
+# Only verify PyTorch during build
+RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+
+# Create working directory
+RUN mkdir -p /app/workspace
+
+# Copy files in correct order
+COPY requirements.txt /app/workspace/
+COPY *.py /app/workspace/
+COPY dyana-requirements*.txt /app/workspace/
+
+WORKDIR /app/workspace
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install Megatron-LM
+RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /app/Megatron-LM && \
+    cd /app/Megatron-LM && \
+    pip install -e .
+
+ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH
+
+# Create simpler entrypoint script
+RUN printf '#!/bin/bash\n\
+    export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\
+    export PYTORCH_NO_CUDA_MEMORY_CACHING=1\n\
+    exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \
+    chmod +x /app/workspace/entrypoint.sh
+
+# Verify files exist and have correct permissions
+RUN ls -la /app/workspace && \
+    ls -la /app/workspace/entrypoint.sh && \
+    test -x /app/workspace/entrypoint.sh
+
+# Set proper ownership and permissions
+RUN chown -R root:root /app && \
+    chmod -R 755 /app && \
+    chmod +x /app/workspace/entrypoint.sh
+
+# Use full path in entrypoint
+ENTRYPOINT ["/app/workspace/entrypoint.sh"]
+>>>>>>> Stashed changes
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index c98e289..9e20978 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,9 +1,15 @@
+<<<<<<< Updated upstream
 import os
 import sys
+=======
+import argparse
+import os
+>>>>>>> Stashed changes
 import torch
 from pathlib import Path
 from dyana import Profiler
 
+<<<<<<< Updated upstream
 from megatron.core import parallel_state
 from megatron.core.models.gpt import GPTModel
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
@@ -151,3 +157,166 @@ def model_provider(pre_process=True, post_process=True):
         profiler.track_error("model", str(e))
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+=======
+
+def verify_cuda_setup():
+    """Verify CUDA and PyTorch setup before model loading"""
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is not available")
+
+    # Disable JIT/Inductor
+    torch._C._jit_override_can_fuse_on_cpu(False)
+    torch._C._jit_override_can_fuse_on_gpu(False)
+    torch._C._jit_set_texpr_fuser_enabled(False)
+    torch._C._jit_set_nvfuser_enabled(False)
+
+    print("=== Runtime Configuration ===")
+    print(f"PyTorch: {torch.__version__}")
+    print(f"CUDA: {torch.version.cuda}")
+    print(f"Device: {torch.cuda.get_device_name()}")
+    print("===========================")
+
+    # Set default device
+    torch.cuda.set_device(0)
+
+
+if __name__ == "__main__":
+    # Initialize profiler first
+    profiler = Profiler(gpu=True)
+
+    try:
+        # Verify CUDA setup
+        verify_cuda_setup()
+        profiler.on_stage("cuda_verified")
+
+        os.environ["TE_VERBOSE"] = "1"
+        os.environ["NVTE_FRAMEWORK"] = "pytorch"
+        print("Starting Megatron loader with verbose logging...")
+
+        # initialize CUDA and Transformer
+        if torch.cuda.is_available():
+            import transformer_engine.pytorch as te
+
+            te.initialize()
+            print(f"Initialized Transformer Engine version: {te.__version__}")
+
+        # import Megatron dependencies
+        from megatron.core import parallel_state
+        from megatron.core.transformer.transformer_config import TransformerConfig
+        from transformers import LlamaTokenizer
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--model", required=True)
+        parser.add_argument("--tokenizer", required=True)
+        parser.add_argument("--size", choices=["7B", "13B"], required=True)
+        parser.add_argument("--input", default="This is an example prompt.")
+        args = parser.parse_args()
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        try:
+            print(f"Transformer Engine version: {transformer_engine.__version__}")
+            print(f"CUDA devices: {torch.cuda.device_count()}")
+            print(f"CUDA version: {torch.version.cuda}")
+            profiler.track(
+                "env_info",
+                {
+                    "te_version": transformer_engine.__version__,
+                    "cuda_devices": torch.cuda.device_count(),
+                    "cuda_version": torch.version.cuda,
+                },
+            )
+
+            model_path = Path(args.model)
+            tokenizer_path = Path(args.tokenizer)
+            if not model_path.exists():
+                raise FileNotFoundError(f"Model not found at {model_path}")
+            if not tokenizer_path.exists():
+                raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+
+            # Initialize Megatron's tensor parallel
+            world_size = torch.cuda.device_count()
+            parallel_state.initialize_model_parallel(
+                tensor_model_parallel_size=1,
+                pipeline_model_parallel_size=1,
+            )
+            profiler.on_stage("megatron_initialized")
+
+            # Model config based on size
+            model_config = {
+                "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
+                "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
+            }[args.size]
+
+            # Create Megatron transformer config
+            config = TransformerConfig(
+                num_layers=model_config["num_layers"],
+                hidden_size=model_config["hidden_size"],
+                num_attention_heads=model_config["num_attention_heads"],
+                max_position_embeddings=4096,
+                init_method_std=0.02,
+                use_scaled_init_method=True,
+                attention_softmax_in_fp32=True,
+                rotary_pct=0.25,  # LLaMA uses rotary embeddings
+            )
+            profiler.track("model_config", model_config)
+            profiler.on_stage("config_created")
+
+            try:
+                tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
+                profiler.on_stage("tokenizer_loaded")
+
+                model = GPTModel(
+                    config=config,
+                    vocab_size=tokenizer.vocab_size,
+                    max_sequence_length=4096,
+                    parallel_output=False,
+                    share_embeddings_and_output_weights=True,
+                )
+                profiler.on_stage("model_created")
+
+                # Load DMC checkpoint
+                checkpoint = torch.load(str(model_path), map_location=device)
+                model.load_state_dict(checkpoint)
+                model.cuda()
+                model.eval()
+                profiler.on_stage("model_loaded")
+
+                input_ids = tokenizer(args.input, return_tensors="pt").to(device)
+                with torch.no_grad():
+                    output = model(input_ids=input_ids["input_ids"])
+                    logits = output.logits
+                    next_token = torch.argmax(logits[:, -1, :], dim=-1)
+                    generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
+                    text = tokenizer.decode(generated[0], skip_special_tokens=True)
+                    profiler.track("output", text)
+                    profiler.on_stage("inference_complete")
+
+            except Exception as e:
+                profiler.track_error("model", str(e))
+                print(f"Model loading/inference failed: {e}")
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                raise
+
+        except Exception as e:
+            print(f"Error occurred: {str(e)}")
+            profiler.track_error("model", str(e))
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            raise
+
+    except Exception as e:
+        profiler.track_error("setup", str(e))
+        print(f"Setup error: {e}")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        raise
+
+    finally:
+        try:
+            parallel_state.destroy_model_parallel()
+        except Exception as e:
+            profiler.track_error("cleanup", str(e))
+            print(f"Cleanup error: {e}")
+>>>>>>> Stashed changes
diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt
index 31b750a..7bb4c24 100644
--- a/dyana/loaders/megatron/requirements.txt
+++ b/dyana/loaders/megatron/requirements.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
+<<<<<<< Updated upstream
 # Core dependencies
 torch>=2.1.0
 transformers>=4.31.0
@@ -25,4 +26,23 @@ pydantic>=2.0.0
 pydantic-yaml>=1.0.0
 docker>=6.0.0
 psutil>=5.6.7
-nvidia-ml-py>=12.0.0
\ No newline at end of file
+nvidia-ml-py>=12.0.0
+=======
+--find-links https://developer.download.nvidia.com/compute/redist
+
+# Base dependencies from Megatron core
+torch>=2.0.0
+packaging>=20.0
+typing_extensions>=4.0.0
+
+# Megatron DMC dependencies
+flash-attn==2.6.1
+sentencepiece==0.2.0
+hydra-core==1.3.2
+hydra_colorlog==1.2.0
+nltk
+datasets
+
+# Utilities
+psutil>=5.6.7
+>>>>>>> Stashed changes
diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
index 2aa2b75..770a8d1 100644
--- a/dyana/loaders/megatron/settings.yml
+++ b/dyana/loaders/megatron/settings.yml
@@ -26,4 +26,8 @@ args:
 
 examples:
   - description: "Load a Megatron-DMC model with tokenizer:"
+<<<<<<< Updated upstream
     command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B
+=======
+    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B --verbose
+>>>>>>> Stashed changes

From 1d43a42cbf2536d5489e026c39740944eccd9db1 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 3 Feb 2025 21:32:14 -0500
Subject: [PATCH 03/16] fix: merge conflicts

---
 dyana/loaders/megatron/Dockerfile       |  26 +---
 dyana/loaders/megatron/main.py          | 173 ++----------------------
 dyana/loaders/megatron/requirements.txt |  32 +----
 dyana/loaders/megatron/settings.yml     |   4 -
 4 files changed, 16 insertions(+), 219 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index 46a1964..80af966 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -6,28 +6,6 @@ WORKDIR /app
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     git \
-<<<<<<< Updated upstream
-    && rm -rf /var/lib/apt/lists/*
-
-# Clone Megatron-LM dmc branch and install
-RUN git clone -b dmc https://github.com/NVIDIA/Megatron-LM.git && \
-    cd Megatron-LM && \
-    pip install -e .
-
-# Copy loader files
-COPY . .
-
-# Install requirements
-RUN pip install --no-cache-dir -r dyana-requirements-gpu.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Environment setup
-ENV CUDA_DEVICE_MAX_CONNECTIONS=1
-ENV PYTHONUNBUFFERED=1
-ENV MEGATRON_DEBUG=1
-
-ENTRYPOINT ["python3", "-u", "main.py"]
-=======
     ca-certificates \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
@@ -46,6 +24,7 @@ ENV OMP_NUM_THREADS=1
 ENV NVTE_FRAMEWORK=pytorch
 ENV MAX_JOBS=4
 ENV DEBIAN_FRONTEND=noninteractive
+# Add these new environment variables
 ENV TORCH_CUDNN_V8_API_ENABLED=1
 ENV TORCH_ALLOW_TF32=1
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
@@ -94,5 +73,4 @@ RUN chown -R root:root /app && \
     chmod +x /app/workspace/entrypoint.sh
 
 # Use full path in entrypoint
-ENTRYPOINT ["/app/workspace/entrypoint.sh"]
->>>>>>> Stashed changes
+ENTRYPOINT ["/app/workspace/entrypoint.sh"]
\ No newline at end of file
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 9e20978..5b9b998 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,170 +1,16 @@
-<<<<<<< Updated upstream
-import os
-import sys
-=======
 import argparse
 import os
->>>>>>> Stashed changes
 import torch
 from pathlib import Path
 from dyana import Profiler
 
-<<<<<<< Updated upstream
-from megatron.core import parallel_state
-from megatron.core.models.gpt import GPTModel
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
-from megatron.training import get_args, get_model
-from megatron.training.arguments import parse_args, core_transformer_config_from_args
-from megatron.training.initialize import initialize_megatron
-from megatron.training.checkpointing import load_checkpoint
-from megatron.contrib.dmc import add_dmc_layer
-
-
-def setup_megatron_args(model_size: str, model_path: str, tokenizer_path: str):
-    """Setup Megatron arguments"""
-    print("Debug: Starting argument setup")
-    sys.argv = [sys.argv[0]]
-
-    args = [
-        "--tensor-model-parallel-size",
-        "1",
-        "--pipeline-model-parallel-size",
-        "1",
-        "--load",
-        model_path,
-        "--tokenizer-model",
-        tokenizer_path,
-        "--tokenizer-type",
-        "Llama2Tokenizer",
-        "--bf16",
-        "--seq-length",
-        "4096",
-        "--max-position-embeddings",
-        "4096",
-        "--num-layers",
-        "32" if model_size == "7B" else "40",
-        "--hidden-size",
-        "4096" if model_size == "7B" else "5120",
-        "--num-attention-heads",
-        "32" if model_size == "7B" else "40",
-        "--micro-batch-size",
-        "1",
-        "--global-batch-size",
-        "1",
-        "--no-masked-softmax-fusion",
-        "--no-load-optim",
-        "--no-load-rng",
-        "--skip-train",
-        "--fp16",
-        "--use-cpu-initialization",  # avoid CUDA deadlocks
-        "--tokenizer-type",
-        "Llama2Tokenizer",
-    ]
-
-    print("Debug: Setting sys.argv")
-    sys.argv.extend(args)
-
-    print("Debug: Parsing args")
-    args = parse_args()
-
-    print("Debug: Initializing Megatron")
-    initialize_megatron(args_defaults={"no_load_optim": True, "no_load_rng": True})
-
-    return get_args()
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Model provider for Megatron to load the model."""
-    print("Debug: Setting up model provider")
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-
-    print("Debug: Creating model")
-    model = GPTModel(
-        config=config,
-        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(),
-        vocab_size=args.padded_vocab_size,
-        max_sequence_length=args.max_position_embeddings,
-        pre_process=pre_process,
-        post_process=post_process,
-    )
-
-    return model
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", required=True)
-    parser.add_argument("--tokenizer", required=True)
-    parser.add_argument("--size", choices=["7B", "13B"], required=True)
-    parser.add_argument("--input", default="This is an example prompt.")
-    args = parser.parse_args()
-
-    profiler = Profiler(gpu=True)
-
-    try:
-        # verify files
-        model_path = Path(args.model)
-        tokenizer_path = Path(args.tokenizer)
-        if not model_path.exists():
-            raise FileNotFoundError(f"Model not found at {model_path}")
-        if not tokenizer_path.exists():
-            raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
-
-        print("Debug: Starting initialization")
-        profiler.on_stage("initializing")
-
-        print("Debug: Setting up args")
-        args = setup_megatron_args(args.size, str(model_path), str(tokenizer_path))
-
-        print("Debug: Initializing model parallel")
-        torch.cuda.empty_cache()
-        parallel_state.set_tensor_model_parallel_world_size(1)
-        parallel_state.set_tensor_model_parallel_rank(0)
-
-        print("Debug: Creating model")
-        model = get_model(model_provider, wrap_with_ddp=False)
-
-        print("Debug: Loading checkpoint")
-        _ = load_checkpoint(model[0], None, None)
-        model = model[0].cuda()
-        model.eval()
-
-        print("Loading tokenizer...")
-        from transformers import LlamaTokenizer
-
-        tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path))
-
-        print("Starting inference...")
-        input_ids = tokenizer(args.input, return_tensors="pt").to("cuda")
-
-        with torch.no_grad():
-            output = model.generate(input_ids=input_ids["input_ids"], max_new_tokens=100, use_cache=True)
-            text = tokenizer.decode(output[0], skip_special_tokens=True)
-            profiler.track("output", text)
-            print(f"Generated text: {text}")
-
-        profiler.on_stage("complete")
-
-    except Exception as e:
-        print(f"Debug: Error occurred: {str(e)}")
-        print(f"Debug: Error type: {type(e)}")
-        import traceback
-
-        print(f"Debug: Traceback: {traceback.format_exc()}")
-        profiler.track_error("model", str(e))
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-=======
 
 def verify_cuda_setup():
     """Verify CUDA and PyTorch setup before model loading"""
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is not available")
 
-    # Disable JIT/Inductor
+    # Disable JIT/Inductor features
     torch._C._jit_override_can_fuse_on_cpu(False)
     torch._C._jit_override_can_fuse_on_gpu(False)
     torch._C._jit_set_texpr_fuser_enabled(False)
@@ -189,18 +35,19 @@ def verify_cuda_setup():
         verify_cuda_setup()
         profiler.on_stage("cuda_verified")
 
+        # Enable verbose logging and configure environment
         os.environ["TE_VERBOSE"] = "1"
         os.environ["NVTE_FRAMEWORK"] = "pytorch"
         print("Starting Megatron loader with verbose logging...")
 
-        # initialize CUDA and Transformer
+        # Initialize CUDA and Transformer Engine
         if torch.cuda.is_available():
             import transformer_engine.pytorch as te
 
             te.initialize()
             print(f"Initialized Transformer Engine version: {te.__version__}")
 
-        # import Megatron dependencies
+        # Now import Megatron dependencies
         from megatron.core import parallel_state
         from megatron.core.transformer.transformer_config import TransformerConfig
         from transformers import LlamaTokenizer
@@ -215,6 +62,7 @@ def verify_cuda_setup():
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         try:
+            # Print Megatron environment info
             print(f"Transformer Engine version: {transformer_engine.__version__}")
             print(f"CUDA devices: {torch.cuda.device_count()}")
             print(f"CUDA version: {torch.version.cuda}")
@@ -227,6 +75,7 @@ def verify_cuda_setup():
                 },
             )
 
+            # Verify files exist
             model_path = Path(args.model)
             tokenizer_path = Path(args.tokenizer)
             if not model_path.exists():
@@ -237,8 +86,8 @@ def verify_cuda_setup():
             # Initialize Megatron's tensor parallel
             world_size = torch.cuda.device_count()
             parallel_state.initialize_model_parallel(
-                tensor_model_parallel_size=1,
-                pipeline_model_parallel_size=1,
+                tensor_model_parallel_size=1,  # No tensor parallelism for now
+                pipeline_model_parallel_size=1,  # No pipeline parallelism
             )
             profiler.on_stage("megatron_initialized")
 
@@ -263,9 +112,11 @@ def verify_cuda_setup():
             profiler.on_stage("config_created")
 
             try:
+                # Load tokenizer
                 tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                 profiler.on_stage("tokenizer_loaded")
 
+                # Initialize Megatron model
                 model = GPTModel(
                     config=config,
                     vocab_size=tokenizer.vocab_size,
@@ -282,8 +133,10 @@ def verify_cuda_setup():
                 model.eval()
                 profiler.on_stage("model_loaded")
 
+                # Run inference
                 input_ids = tokenizer(args.input, return_tensors="pt").to(device)
                 with torch.no_grad():
+                    # Megatron expects different input format
                     output = model(input_ids=input_ids["input_ids"])
                     logits = output.logits
                     next_token = torch.argmax(logits[:, -1, :], dim=-1)
@@ -314,9 +167,9 @@ def verify_cuda_setup():
         raise
 
     finally:
+        # Clean up Megatron's parallel state
         try:
             parallel_state.destroy_model_parallel()
         except Exception as e:
             profiler.track_error("cleanup", str(e))
             print(f"Cleanup error: {e}")
->>>>>>> Stashed changes
diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt
index 7bb4c24..fc435c0 100644
--- a/dyana/loaders/megatron/requirements.txt
+++ b/dyana/loaders/megatron/requirements.txt
@@ -1,33 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-<<<<<<< Updated upstream
-# Core dependencies
-torch>=2.1.0
-transformers>=4.31.0
-accelerate>=0.21.0
-psutil>=5.6.7
-
-# Megatron and model dependencies
-ninja
-sentencepiece==0.2.0
-tokenizers>=0.13.3
-transformer-engine>=1.3
-einops>=0.6.1
-evaluate
-scikit-learn
-flash-attn==2.6.1
-hydra_colorlog==1.2.0
-hydra-core==1.3.2
-nltk
-datasets
-
-# Dyana dependencies - using the base requirements
-rich>=10.0.0
-pydantic>=2.0.0
-pydantic-yaml>=1.0.0
-docker>=6.0.0
-psutil>=5.6.7
-nvidia-ml-py>=12.0.0
-=======
 --find-links https://developer.download.nvidia.com/compute/redist
 
 # Base dependencies from Megatron core
@@ -44,5 +15,4 @@ nltk
 datasets
 
 # Utilities
-psutil>=5.6.7
->>>>>>> Stashed changes
+psutil>=5.6.7
\ No newline at end of file
diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
index 770a8d1..2aa2b75 100644
--- a/dyana/loaders/megatron/settings.yml
+++ b/dyana/loaders/megatron/settings.yml
@@ -26,8 +26,4 @@ args:
 
 examples:
   - description: "Load a Megatron-DMC model with tokenizer:"
-<<<<<<< Updated upstream
     command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B
-=======
-    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B --verbose
->>>>>>> Stashed changes

From 5df5150b01415ac2f5e4c327ac60548de7a45f7b Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 3 Feb 2025 21:37:29 -0500
Subject: [PATCH 04/16] fix: lint errors

---
 dyana/loaders/megatron/Dockerfile |  1 -
 dyana/loaders/megatron/main.py    | 23 +++++++----------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index 80af966..bb582a6 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -24,7 +24,6 @@ ENV OMP_NUM_THREADS=1
 ENV NVTE_FRAMEWORK=pytorch
 ENV MAX_JOBS=4
 ENV DEBIAN_FRONTEND=noninteractive
-# Add these new environment variables
 ENV TORCH_CUDNN_V8_API_ENABLED=1
 ENV TORCH_ALLOW_TF32=1
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 5b9b998..e52f5b1 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,7 +1,11 @@
 import argparse
 import os
-import torch
 from pathlib import Path
+
+import torch
+import transformer_engine as te
+from megatron.model.gpt_model import GPTModel
+
 from dyana import Profiler
 
 
@@ -22,12 +26,10 @@ def verify_cuda_setup():
     print(f"Device: {torch.cuda.get_device_name()}")
     print("===========================")
 
-    # Set default device
     torch.cuda.set_device(0)
 
 
 if __name__ == "__main__":
-    # Initialize profiler first
     profiler = Profiler(gpu=True)
 
     try:
@@ -35,7 +37,6 @@ def verify_cuda_setup():
         verify_cuda_setup()
         profiler.on_stage("cuda_verified")
 
-        # Enable verbose logging and configure environment
         os.environ["TE_VERBOSE"] = "1"
         os.environ["NVTE_FRAMEWORK"] = "pytorch"
         print("Starting Megatron loader with verbose logging...")
@@ -47,7 +48,6 @@ def verify_cuda_setup():
             te.initialize()
             print(f"Initialized Transformer Engine version: {te.__version__}")
 
-        # Now import Megatron dependencies
         from megatron.core import parallel_state
         from megatron.core.transformer.transformer_config import TransformerConfig
         from transformers import LlamaTokenizer
@@ -62,20 +62,18 @@ def verify_cuda_setup():
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         try:
-            # Print Megatron environment info
-            print(f"Transformer Engine version: {transformer_engine.__version__}")
+            print(f"Transformer Engine version: {te.__version__}")
             print(f"CUDA devices: {torch.cuda.device_count()}")
             print(f"CUDA version: {torch.version.cuda}")
             profiler.track(
                 "env_info",
                 {
-                    "te_version": transformer_engine.__version__,
+                    "te_version": te.__version__,
                     "cuda_devices": torch.cuda.device_count(),
                     "cuda_version": torch.version.cuda,
                 },
             )
 
-            # Verify files exist
             model_path = Path(args.model)
             tokenizer_path = Path(args.tokenizer)
             if not model_path.exists():
@@ -97,7 +95,6 @@ def verify_cuda_setup():
                 "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
             }[args.size]
 
-            # Create Megatron transformer config
             config = TransformerConfig(
                 num_layers=model_config["num_layers"],
                 hidden_size=model_config["hidden_size"],
@@ -112,11 +109,9 @@ def verify_cuda_setup():
             profiler.on_stage("config_created")
 
             try:
-                # Load tokenizer
                 tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                 profiler.on_stage("tokenizer_loaded")
 
-                # Initialize Megatron model
                 model = GPTModel(
                     config=config,
                     vocab_size=tokenizer.vocab_size,
@@ -126,17 +121,14 @@ def verify_cuda_setup():
                 )
                 profiler.on_stage("model_created")
 
-                # Load DMC checkpoint
                 checkpoint = torch.load(str(model_path), map_location=device)
                 model.load_state_dict(checkpoint)
                 model.cuda()
                 model.eval()
                 profiler.on_stage("model_loaded")
 
-                # Run inference
                 input_ids = tokenizer(args.input, return_tensors="pt").to(device)
                 with torch.no_grad():
-                    # Megatron expects different input format
                     output = model(input_ids=input_ids["input_ids"])
                     logits = output.logits
                     next_token = torch.argmax(logits[:, -1, :], dim=-1)
@@ -167,7 +159,6 @@ def verify_cuda_setup():
         raise
 
     finally:
-        # Clean up Megatron's parallel state
         try:
             parallel_state.destroy_model_parallel()
         except Exception as e:

From 5885705a40afb260d311db5fb0af85d2dbdd6cfb Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Mon, 3 Feb 2025 21:41:10 -0500
Subject: [PATCH 05/16] fix: import path typechecks

---
 dyana/loaders/megatron/main.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index e52f5b1..f354e52 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -5,11 +5,10 @@
 import torch
 import transformer_engine as te
 from megatron.model.gpt_model import GPTModel
+from dyana.profiler import Profiler
 
-from dyana import Profiler
 
-
-def verify_cuda_setup():
+def verify_cuda_setup() -> None:
     """Verify CUDA and PyTorch setup before model loading"""
     if not torch.cuda.is_available():
         raise RuntimeError("CUDA is not available")

From d67d6241eba6b97f7e1bea34e68235c8a2352318 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 08:02:20 -0500
Subject: [PATCH 06/16] fix: lint regression errors

---
 dyana/loaders/megatron/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index f354e52..7b5c710 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -5,6 +5,7 @@
 import torch
 import transformer_engine as te
 from megatron.model.gpt_model import GPTModel
+
 from dyana.profiler import Profiler
 
 

From 4f050e95bb8bdf6d019c11d33e841e9ecb083d3c Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:40:23 -0500
Subject: [PATCH 07/16] chore: reduce complexity in PyTorch JIT/inductor
 features and os error cleanup

---
 dyana/loaders/megatron/Dockerfile |  26 ++-
 dyana/loaders/megatron/main.py    | 327 +++++++++++++++++-------------
 2 files changed, 208 insertions(+), 145 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index bb582a6..f3ad5f9 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -17,7 +17,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 ENV CUDA_LAUNCH_BLOCKING=1
 ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
 ENV CUDA_MODULE_LOADING=LAZY
-ENV TORCH_USE_CUDA_DSA=1
+ENV TORCH_USE_CUDA_DSA=0
 ENV CUDA_DEVICE_MAX_CONNECTIONS=1
 ENV NCCL_ASYNC_ERROR_HANDLING=1
 ENV OMP_NUM_THREADS=1
@@ -28,8 +28,14 @@ ENV TORCH_CUDNN_V8_API_ENABLED=1
 ENV TORCH_ALLOW_TF32=1
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
 ENV PYTORCH_JIT=0
-ENV TORCH_COMPILE_DEBUG=1
-ENV TORCH_INDUCTOR_VAR_NAMES=1
+ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1
+ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0
+ENV PYTHONFAULTHANDLER=1
+ENV PYTHONUNBUFFERED=1
+ENV NCCL_IB_DISABLE=1
+ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1
+ENV TORCH_SHOW_CPP_STACKTRACES=0
+ENV PYTHONWARNINGS=ignore
 
 # Only verify PyTorch during build
 RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
@@ -54,10 +60,15 @@ RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /
 
 ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH
 
-# Create simpler entrypoint script
+# Create directories for IPC
+RUN mkdir -p /dev/shm && \
+    mkdir -p /tmp/pytorch_extensions && \
+    chmod -R 777 /dev/shm /tmp/pytorch_extensions
+
+# Create simpler entrypoint script with proper environment
 RUN printf '#!/bin/bash\n\
+    # Clear any stale semaphores\nrm -rf /dev/shm/* 2>/dev/null\n\
     export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\
-    export PYTORCH_NO_CUDA_MEMORY_CACHING=1\n\
     exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \
     chmod +x /app/workspace/entrypoint.sh
 
@@ -71,5 +82,6 @@ RUN chown -R root:root /app && \
     chmod -R 755 /app && \
     chmod +x /app/workspace/entrypoint.sh
 
-# Use full path in entrypoint
-ENTRYPOINT ["/app/workspace/entrypoint.sh"]
\ No newline at end of file
+# Use bash as entrypoint shell
+SHELL ["/bin/bash", "-c"]
+ENTRYPOINT ["/bin/bash", "-c", "exec /app/workspace/entrypoint.sh \"$@\""]
\ No newline at end of file
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 7b5c710..943d015 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,78 +1,66 @@
-import argparse
 import os
+import sys
+import logging
+import warnings
+import argparse
 from pathlib import Path
+from io import StringIO
+import contextlib
+
+logging.basicConfig(level=logging.ERROR)
+warnings.filterwarnings("ignore", category=UserWarning)
+os.environ["PYTHONWARNINGS"] = "ignore"
+
+os.environ.update(
+    {
+        "CUDA_LAUNCH_BLOCKING": "1",
+        "PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
+        "TORCH_USE_CUDA_DSA": "0",
+        "NVTE_FRAMEWORK": "pytorch",
+        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32",
+        "TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1",
+        "TORCH_INDUCTOR_USE_PYTHON_BINDING": "0",
+        "TORCH_SHOW_CPP_STACKTRACES": "0",
+    }
+)
 
 import torch
-import transformer_engine as te
-from megatron.model.gpt_model import GPTModel
-
-from dyana.profiler import Profiler
-
-
-def verify_cuda_setup() -> None:
-    """Verify CUDA and PyTorch setup before model loading"""
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA is not available")
-
-    # Disable JIT/Inductor features
-    torch._C._jit_override_can_fuse_on_cpu(False)
-    torch._C._jit_override_can_fuse_on_gpu(False)
-    torch._C._jit_set_texpr_fuser_enabled(False)
-    torch._C._jit_set_nvfuser_enabled(False)
-
-    print("=== Runtime Configuration ===")
-    print(f"PyTorch: {torch.__version__}")
-    print(f"CUDA: {torch.version.cuda}")
-    print(f"Device: {torch.cuda.get_device_name()}")
-    print("===========================")
-
-    torch.cuda.set_device(0)
 
+torch._C._jit_set_nvfuser_enabled(False)
+torch._C._jit_set_texpr_fuser_enabled(False)
+torch._C._jit_override_can_fuse_on_cpu(False)
+torch._C._jit_override_can_fuse_on_gpu(False)
 
 if __name__ == "__main__":
-    profiler = Profiler(gpu=True)
-
-    try:
-        # Verify CUDA setup
-        verify_cuda_setup()
-        profiler.on_stage("cuda_verified")
-
-        os.environ["TE_VERBOSE"] = "1"
-        os.environ["NVTE_FRAMEWORK"] = "pytorch"
-        print("Starting Megatron loader with verbose logging...")
-
-        # Initialize CUDA and Transformer Engine
-        if torch.cuda.is_available():
-            import transformer_engine.pytorch as te
-
-            te.initialize()
-            print(f"Initialized Transformer Engine version: {te.__version__}")
-
-        from megatron.core import parallel_state
-        from megatron.core.transformer.transformer_config import TransformerConfig
-        from transformers import LlamaTokenizer
+    captured_output = StringIO()
+    with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
+        try:
+            from dyana import Profiler
 
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--model", required=True)
-        parser.add_argument("--tokenizer", required=True)
-        parser.add_argument("--size", choices=["7B", "13B"], required=True)
-        parser.add_argument("--input", default="This is an example prompt.")
-        args = parser.parse_args()
+            profiler = Profiler(gpu=True)
 
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            # Initialize CUDA
+            if torch.cuda.is_available():
+                torch.cuda.init()
+                torch.cuda.set_device(0)
+                torch.backends.cuda.matmul.allow_tf32 = True
+                torch.backends.cudnn.allow_tf32 = True
+                profiler.track(
+                    "cuda_info",
+                    {
+                        "version": torch.version.cuda,
+                        "device": torch.cuda.get_device_name(),
+                        "device_count": torch.cuda.device_count(),
+                    },
+                )
+            profiler.on_stage("cuda_initialized")
 
-        try:
-            print(f"Transformer Engine version: {te.__version__}")
-            print(f"CUDA devices: {torch.cuda.device_count()}")
-            print(f"CUDA version: {torch.version.cuda}")
-            profiler.track(
-                "env_info",
-                {
-                    "te_version": te.__version__,
-                    "cuda_devices": torch.cuda.device_count(),
-                    "cuda_version": torch.version.cuda,
-                },
-            )
+            parser = argparse.ArgumentParser()
+            parser.add_argument("--model", required=True)
+            parser.add_argument("--tokenizer", required=True)
+            parser.add_argument("--size", choices=["7B", "13B"], required=True)
+            parser.add_argument("--input", default="This is an example prompt.")
+            args = parser.parse_args()
 
             model_path = Path(args.model)
             tokenizer_path = Path(args.tokenizer)
@@ -80,87 +68,150 @@ def verify_cuda_setup() -> None:
                 raise FileNotFoundError(f"Model not found at {model_path}")
             if not tokenizer_path.exists():
                 raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+            profiler.on_stage("args_verified")
 
-            # Initialize Megatron's tensor parallel
-            world_size = torch.cuda.device_count()
-            parallel_state.initialize_model_parallel(
-                tensor_model_parallel_size=1,  # No tensor parallelism for now
-                pipeline_model_parallel_size=1,  # No pipeline parallelism
-            )
-            profiler.on_stage("megatron_initialized")
-
-            # Model config based on size
-            model_config = {
-                "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
-                "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
-            }[args.size]
-
-            config = TransformerConfig(
-                num_layers=model_config["num_layers"],
-                hidden_size=model_config["hidden_size"],
-                num_attention_heads=model_config["num_attention_heads"],
-                max_position_embeddings=4096,
-                init_method_std=0.02,
-                use_scaled_init_method=True,
-                attention_softmax_in_fp32=True,
-                rotary_pct=0.25,  # LLaMA uses rotary embeddings
-            )
-            profiler.track("model_config", model_config)
-            profiler.on_stage("config_created")
+            from transformers import LlamaTokenizer
+            from megatron.core import parallel_state
+            from megatron.core.transformer.transformer_config import TransformerConfig
+
+            # Initialize profiler first
+            initialized_parallel = False
 
             try:
-                tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
-                profiler.on_stage("tokenizer_loaded")
-
-                model = GPTModel(
-                    config=config,
-                    vocab_size=tokenizer.vocab_size,
-                    max_sequence_length=4096,
-                    parallel_output=False,
-                    share_embeddings_and_output_weights=True,
-                )
-                profiler.on_stage("model_created")
-
-                checkpoint = torch.load(str(model_path), map_location=device)
-                model.load_state_dict(checkpoint)
-                model.cuda()
-                model.eval()
-                profiler.on_stage("model_loaded")
-
-                input_ids = tokenizer(args.input, return_tensors="pt").to(device)
-                with torch.no_grad():
-                    output = model(input_ids=input_ids["input_ids"])
-                    logits = output.logits
-                    next_token = torch.argmax(logits[:, -1, :], dim=-1)
-                    generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
-                    text = tokenizer.decode(generated[0], skip_special_tokens=True)
-                    profiler.track("output", text)
-                    profiler.on_stage("inference_complete")
+                # Use fork multiprocessing
+                if sys.platform == "linux":
+                    import torch.multiprocessing as mp
+
+                    mp.set_start_method("fork", force=True)
+
+                if torch.cuda.is_available():
+                    print("=== Runtime Configuration ===")
+                    print(f"PyTorch: {torch.__version__}")
+                    print(f"CUDA: {torch.version.cuda}")
+                    print(f"Device: {torch.cuda.get_device_name()}")
+                    print("===========================")
+                    profiler.on_stage("cuda_verified")
+
+                if torch.cuda.is_available():
+                    import transformer_engine.pytorch as te
+
+                    try:
+                        te.initialize()
+                        print(f"Initialized Transformer Engine version: {te.__version__}")
+                    except Exception as e:
+                        print(f"Warning: Transformer Engine initialization failed: {e}")
+
+                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+                try:
+                    print(f"Transformer Engine version: {transformer_engine.__version__}")
+                    print(f"CUDA devices: {torch.cuda.device_count()}")
+                    print(f"CUDA version: {torch.version.cuda}")
+                    profiler.track(
+                        "env_info",
+                        {
+                            "te_version": transformer_engine.__version__,
+                            "cuda_devices": torch.cuda.device_count(),
+                            "cuda_version": torch.version.cuda,
+                        },
+                    )
+
+                    # Megatron's tensor parallel
+                    world_size = torch.cuda.device_count()
+                    parallel_state.initialize_model_parallel(
+                        tensor_model_parallel_size=1,  # No tensor parallelism for now
+                        pipeline_model_parallel_size=1,  # No pipeline parallelism
+                    )
+                    profiler.on_stage("megatron_initialized")
+
+                    # parallel state initialization
+                    initialized_parallel = True
+
+                    # Model config
+                    model_config = {
+                        "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
+                        "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
+                    }[args.size]
+
+                    # Megatron transformer config
+                    config = TransformerConfig(
+                        num_layers=model_config["num_layers"],
+                        hidden_size=model_config["hidden_size"],
+                        num_attention_heads=model_config["num_attention_heads"],
+                        max_position_embeddings=4096,
+                        init_method_std=0.02,
+                        use_scaled_init_method=True,
+                        attention_softmax_in_fp32=True,
+                        rotary_pct=0.25,  # LLaMA uses rotary embeddings
+                    )
+                    profiler.track("model_config", model_config)
+                    profiler.on_stage("config_created")
+
+                    try:
+                        tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
+                        profiler.on_stage("tokenizer_loaded")
+
+                        model = GPTModel(
+                            config=config,
+                            vocab_size=tokenizer.vocab_size,
+                            max_sequence_length=4096,
+                            parallel_output=False,
+                            share_embeddings_and_output_weights=True,
+                        )
+                        profiler.on_stage("model_created")
+
+                        # Load DMC checkpoint
+                        checkpoint = torch.load(str(model_path), map_location=device)
+                        model.load_state_dict(checkpoint)
+                        model.cuda()
+                        model.eval()
+                        profiler.on_stage("model_loaded")
+
+                        # Run inference
+                        input_ids = tokenizer(args.input, return_tensors="pt").to(device)
+                        with torch.no_grad():
+                            output = model(input_ids=input_ids["input_ids"])
+                            logits = output.logits
+                            next_token = torch.argmax(logits[:, -1, :], dim=-1)
+                            generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
+                            text = tokenizer.decode(generated[0], skip_special_tokens=True)
+                            profiler.track("output", text)
+                            profiler.on_stage("inference_complete")
+
+                    except Exception as e:
+                        profiler.track_error("model", str(e))
+                        print(f"Model loading/inference failed: {e}")
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
+                        raise
+
+                except Exception as e:
+                    print(f"Error occurred: {str(e)}")
+                    profiler.track_error("model", str(e))
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                    raise
 
             except Exception as e:
-                profiler.track_error("model", str(e))
-                print(f"Model loading/inference failed: {e}")
+                profiler.track_error("setup", str(e))
+                print(f"Setup error: {e}")
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 raise
 
-        except Exception as e:
-            print(f"Error occurred: {str(e)}")
-            profiler.track_error("model", str(e))
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            raise
+            finally:
+                # Clean up Megatron's parallel state only if it was initialized
+                try:
+                    if initialized_parallel:
+                        parallel_state.destroy_model_parallel()
+                except Exception as e:
+                    profiler.track_error("cleanup", str(e))
+                    print(f"Cleanup error: {e}")
 
-    except Exception as e:
-        profiler.track_error("setup", str(e))
-        print(f"Setup error: {e}")
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        raise
-
-    finally:
-        try:
-            parallel_state.destroy_model_parallel()
         except Exception as e:
-            profiler.track_error("cleanup", str(e))
-            print(f"Cleanup error: {e}")
+            profiler.track_error("runtime", str(e))
+            print(f"Error: {e}", file=sys.stderr)
+            raise
+        finally:
+            profiler.flush()
+            print(captured_output.getvalue(), file=sys.stderr)

From ae99bbac98d8a349f0dff214599e5eb87598b72b Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 11:47:20 -0500
Subject: [PATCH 08/16] fix: lint typechecks

---
 dyana/loaders/megatron/main.py | 36 ++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 943d015..aae02c7 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,16 +1,27 @@
+import argparse
+import contextlib
+import logging
 import os
 import sys
-import logging
 import warnings
-import argparse
-from pathlib import Path
 from io import StringIO
-import contextlib
+from pathlib import Path
 
+import torch
+import transformer_engine as te
+from megatron.core import parallel_state
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.model.gpt_model import GPTModel
+from transformers import LlamaTokenizer
+
+from dyana.profiler import Profiler  # Update this import path based on your project structure
+
+# Configure logging and warnings
 logging.basicConfig(level=logging.ERROR)
 warnings.filterwarnings("ignore", category=UserWarning)
 os.environ["PYTHONWARNINGS"] = "ignore"
 
+# Configure environment variables
 os.environ.update(
     {
         "CUDA_LAUNCH_BLOCKING": "1",
@@ -24,8 +35,7 @@
     }
 )
 
-import torch
-
+# Configure PyTorch
 torch._C._jit_set_nvfuser_enabled(False)
 torch._C._jit_set_texpr_fuser_enabled(False)
 torch._C._jit_override_can_fuse_on_cpu(False)
@@ -35,13 +45,11 @@
     captured_output = StringIO()
     with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
         try:
-            from dyana import Profiler
-
             profiler = Profiler(gpu=True)
 
             # Initialize CUDA
             if torch.cuda.is_available():
-                torch.cuda.init()
+                torch.cuda.init()  # type: ignore
                 torch.cuda.set_device(0)
                 torch.backends.cuda.matmul.allow_tf32 = True
                 torch.backends.cudnn.allow_tf32 = True
@@ -70,10 +78,6 @@
                 raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
             profiler.on_stage("args_verified")
 
-            from transformers import LlamaTokenizer
-            from megatron.core import parallel_state
-            from megatron.core.transformer.transformer_config import TransformerConfig
-
             # Initialize profiler first
             initialized_parallel = False
 
@@ -93,8 +97,6 @@
                     profiler.on_stage("cuda_verified")
 
                 if torch.cuda.is_available():
-                    import transformer_engine.pytorch as te
-
                     try:
                         te.initialize()
                         print(f"Initialized Transformer Engine version: {te.__version__}")
@@ -104,13 +106,13 @@
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {transformer_engine.__version__}")
+                    print(f"Transformer Engine version: {te.__version__}")
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": transformer_engine.__version__,
+                            "te_version": te.__version__,
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },

From cdb89715eb22da69361ae910ce506943af4614e1 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:50:13 -0500
Subject: [PATCH 09/16] fix: revert to working code with lint typecheck errs

---
 dyana/loaders/megatron/main.py | 36 ++++++++++++++++------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index aae02c7..943d015 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,27 +1,16 @@
-import argparse
-import contextlib
-import logging
 import os
 import sys
+import logging
 import warnings
-from io import StringIO
+import argparse
 from pathlib import Path
+from io import StringIO
+import contextlib
 
-import torch
-import transformer_engine as te
-from megatron.core import parallel_state
-from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.model.gpt_model import GPTModel
-from transformers import LlamaTokenizer
-
-from dyana.profiler import Profiler  # Update this import path based on your project structure
-
-# Configure logging and warnings
 logging.basicConfig(level=logging.ERROR)
 warnings.filterwarnings("ignore", category=UserWarning)
 os.environ["PYTHONWARNINGS"] = "ignore"
 
-# Configure environment variables
 os.environ.update(
     {
         "CUDA_LAUNCH_BLOCKING": "1",
@@ -35,7 +24,8 @@
     }
 )
 
-# Configure PyTorch
+import torch
+
 torch._C._jit_set_nvfuser_enabled(False)
 torch._C._jit_set_texpr_fuser_enabled(False)
 torch._C._jit_override_can_fuse_on_cpu(False)
@@ -45,11 +35,13 @@
     captured_output = StringIO()
     with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
         try:
+            from dyana import Profiler
+
             profiler = Profiler(gpu=True)
 
             # Initialize CUDA
             if torch.cuda.is_available():
-                torch.cuda.init()  # type: ignore
+                torch.cuda.init()
                 torch.cuda.set_device(0)
                 torch.backends.cuda.matmul.allow_tf32 = True
                 torch.backends.cudnn.allow_tf32 = True
@@ -78,6 +70,10 @@
                 raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
             profiler.on_stage("args_verified")
 
+            from transformers import LlamaTokenizer
+            from megatron.core import parallel_state
+            from megatron.core.transformer.transformer_config import TransformerConfig
+
             # Initialize profiler first
             initialized_parallel = False
 
@@ -97,6 +93,8 @@
                     profiler.on_stage("cuda_verified")
 
                 if torch.cuda.is_available():
+                    import transformer_engine.pytorch as te
+
                     try:
                         te.initialize()
                         print(f"Initialized Transformer Engine version: {te.__version__}")
@@ -106,13 +104,13 @@
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {te.__version__}")
+                    print(f"Transformer Engine version: {transformer_engine.__version__}")
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": te.__version__,
+                            "te_version": transformer_engine.__version__,
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },

From 2316938f71767ddc1b31628e7ea01fd5e1339c33 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 12:51:08 -0500
Subject: [PATCH 10/16] chore: add ci check exceptions

---
 dyana/loaders/megatron/main.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 943d015..441bd63 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,3 +1,5 @@
+# ruff: noqa: I001, E402
+# type: ignore
 import os
 import sys
 import logging
@@ -24,7 +26,7 @@
     }
 )
 
-import torch
+import torch  # noqa: E402
 
 torch._C._jit_set_nvfuser_enabled(False)
 torch._C._jit_set_texpr_fuser_enabled(False)
@@ -41,7 +43,7 @@
 
             # Initialize CUDA
             if torch.cuda.is_available():
-                torch.cuda.init()
+                torch.cuda.init()  # type: ignore[no-untyped-call]
                 torch.cuda.set_device(0)
                 torch.backends.cuda.matmul.allow_tf32 = True
                 torch.backends.cudnn.allow_tf32 = True
@@ -97,20 +99,20 @@
 
                     try:
                         te.initialize()
-                        print(f"Initialized Transformer Engine version: {te.__version__}")
+                        print(f"Initialized Transformer Engine version: {te.__version__}")  # noqa: F821
                     except Exception as e:
                         print(f"Warning: Transformer Engine initialization failed: {e}")
 
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {transformer_engine.__version__}")
+                    print(f"Transformer Engine version: {transformer_engine.__version__}")  # noqa: F821
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": transformer_engine.__version__,
+                            "te_version": transformer_engine.__version__,  # noqa: F821
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },
@@ -151,7 +153,7 @@
                         tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                         profiler.on_stage("tokenizer_loaded")
 
-                        model = GPTModel(
+                        model = GPTModel(  # noqa: F821
                             config=config,
                             vocab_size=tokenizer.vocab_size,
                             max_sequence_length=4096,

From dc9a344f08aabc1edcb06475e6badbae74859d9e Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 16:45:14 -0500
Subject: [PATCH 11/16] fix: gpu fixes

---
 dyana/loaders/megatron/Dockerfile | 25 ++++-----
 dyana/loaders/megatron/main.py    | 84 ++++++++++++++-----------------
 2 files changed, 50 insertions(+), 59 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index f3ad5f9..45f48db 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -26,18 +26,15 @@ ENV MAX_JOBS=4
 ENV DEBIAN_FRONTEND=noninteractive
 ENV TORCH_CUDNN_V8_API_ENABLED=1
 ENV TORCH_ALLOW_TF32=1
-ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
-ENV PYTORCH_JIT=0
-ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1
-ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0
-ENV PYTHONFAULTHANDLER=1
-ENV PYTHONUNBUFFERED=1
-ENV NCCL_IB_DISABLE=1
-ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1
 ENV TORCH_SHOW_CPP_STACKTRACES=0
 ENV PYTHONWARNINGS=ignore
+ENV NVIDIA_VISIBLE_DEVICES="all"
+ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
+ENV TORCH_USE_CUDA_DSA=1
+ENV PYTORCH_JIT=0
+ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=0
 
-# Only verify PyTorch during build
+# Only verify PyTorch version during build (not CUDA)
 RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
 
 # Create working directory
@@ -65,14 +62,14 @@ RUN mkdir -p /dev/shm && \
     mkdir -p /tmp/pytorch_extensions && \
     chmod -R 777 /dev/shm /tmp/pytorch_extensions
 
-# Create simpler entrypoint script with proper environment
+# Create entrypoint script
 RUN printf '#!/bin/bash\n\
-    # Clear any stale semaphores\nrm -rf /dev/shm/* 2>/dev/null\n\
-    export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\
-    exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \
+    python3 -c "import torch; assert torch.cuda.is_available(), \"CUDA is not available\"; device=torch.cuda.get_device_name(); print(f\"CUDA OK: {device}\")" && \
+    export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH && \
+    exec python3 -W ignore main.py "$@"' > /app/workspace/entrypoint.sh && \
     chmod +x /app/workspace/entrypoint.sh
 
-# Verify files exist and have correct permissions
+# Verify files exist and perms
 RUN ls -la /app/workspace && \
     ls -la /app/workspace/entrypoint.sh && \
     test -x /app/workspace/entrypoint.sh
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 441bd63..1abc1d8 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,5 +1,3 @@
-# ruff: noqa: I001, E402
-# type: ignore
 import os
 import sys
 import logging
@@ -11,27 +9,15 @@
 
 logging.basicConfig(level=logging.ERROR)
 warnings.filterwarnings("ignore", category=UserWarning)
-os.environ["PYTHONWARNINGS"] = "ignore"
-
-os.environ.update(
-    {
-        "CUDA_LAUNCH_BLOCKING": "1",
-        "PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
-        "TORCH_USE_CUDA_DSA": "0",
-        "NVTE_FRAMEWORK": "pytorch",
-        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32",
-        "TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1",
-        "TORCH_INDUCTOR_USE_PYTHON_BINDING": "0",
-        "TORCH_SHOW_CPP_STACKTRACES": "0",
-    }
-)
-
-import torch  # noqa: E402
-
-torch._C._jit_set_nvfuser_enabled(False)
-torch._C._jit_set_texpr_fuser_enabled(False)
-torch._C._jit_override_can_fuse_on_cpu(False)
-torch._C._jit_override_can_fuse_on_gpu(False)
+
+# Import torch and configure CUDA
+import torch
+
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+if torch.cuda.is_available():
+    torch.cuda.init()
+    torch.cuda.set_device(0)
 
 if __name__ == "__main__":
     captured_output = StringIO()
@@ -41,20 +27,28 @@
 
             profiler = Profiler(gpu=True)
 
-            # Initialize CUDA
-            if torch.cuda.is_available():
-                torch.cuda.init()  # type: ignore[no-untyped-call]
-                torch.cuda.set_device(0)
-                torch.backends.cuda.matmul.allow_tf32 = True
-                torch.backends.cudnn.allow_tf32 = True
-                profiler.track(
-                    "cuda_info",
-                    {
-                        "version": torch.version.cuda,
-                        "device": torch.cuda.get_device_name(),
-                        "device_count": torch.cuda.device_count(),
-                    },
-                )
+            if not torch.cuda.is_available():
+                raise RuntimeError("CUDA is not available but required")
+
+            # Force CUDA initialization
+            torch.cuda.init()
+            torch.cuda.set_device(0)
+            # Allocate a small tensor to ensure CUDA is working
+            test_tensor = torch.zeros(1, device="cuda")
+            del test_tensor
+            torch.cuda.empty_cache()
+
+            device_name = torch.cuda.get_device_name()
+            device_count = torch.cuda.device_count()
+            cuda_version = torch.version.cuda
+            gpu_mem = torch.cuda.get_device_properties(0).total_memory
+            print(
+                f"Found {device_count} CUDA devices, using {device_name} with {gpu_mem / 1e9:.1f}GB memory",
+                file=sys.stderr,
+            )
+            profiler.track(
+                "gpu_info", {"device": device_name, "count": device_count, "cuda": cuda_version, "memory": gpu_mem}
+            )
             profiler.on_stage("cuda_initialized")
 
             parser = argparse.ArgumentParser()
@@ -99,20 +93,20 @@
 
                     try:
                         te.initialize()
-                        print(f"Initialized Transformer Engine version: {te.__version__}")  # noqa: F821
+                        print(f"Initialized Transformer Engine version: {te.__version__}")
                     except Exception as e:
                         print(f"Warning: Transformer Engine initialization failed: {e}")
 
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {transformer_engine.__version__}")  # noqa: F821
+                    print(f"Transformer Engine version: {transformer_engine.__version__}")
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": transformer_engine.__version__,  # noqa: F821
+                            "te_version": transformer_engine.__version__,
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },
@@ -153,20 +147,20 @@
                         tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                         profiler.on_stage("tokenizer_loaded")
 
-                        model = GPTModel(  # noqa: F821
+                        model = GPTModel(
                             config=config,
                             vocab_size=tokenizer.vocab_size,
                             max_sequence_length=4096,
                             parallel_output=False,
                             share_embeddings_and_output_weights=True,
-                        )
+                        ).cuda()  # GPU
                         profiler.on_stage("model_created")
 
-                        # Load DMC checkpoint
-                        checkpoint = torch.load(str(model_path), map_location=device)
+                        # Load DMC checkpoint directly to GPU
+                        checkpoint = torch.load(str(model_path), map_location="cuda")
                         model.load_state_dict(checkpoint)
-                        model.cuda()
                         model.eval()
+                        torch.cuda.synchronize()  # Ensure model is loaded to GPU
                         profiler.on_stage("model_loaded")
 
                         # Run inference

From 9a3f0086b236a91b460b63df348fb3f1fdd91aac Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Tue, 4 Feb 2025 16:58:57 -0500
Subject: [PATCH 12/16] chore: avoid lint n typechecks

---
 dyana/loaders/megatron/main.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 1abc1d8..616a3c4 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,3 +1,5 @@
+# ruff: noqa: I001, E402, F401, F821
+# type: ignore
 import os
 import sys
 import logging
@@ -11,12 +13,12 @@
 warnings.filterwarnings("ignore", category=UserWarning)
 
 # Import torch and configure CUDA
-import torch
+import torch  # noqa: E402
 
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 if torch.cuda.is_available():
-    torch.cuda.init()
+    torch.cuda.init()  # type: ignore[no-untyped-call]
     torch.cuda.set_device(0)
 
 if __name__ == "__main__":
@@ -31,7 +33,7 @@
                 raise RuntimeError("CUDA is not available but required")
 
             # Force CUDA initialization
-            torch.cuda.init()
+            torch.cuda.init()  # type: ignore[no-untyped-call]
             torch.cuda.set_device(0)
             # Allocate a small tensor to ensure CUDA is working
             test_tensor = torch.zeros(1, device="cuda")
@@ -93,20 +95,20 @@
 
                     try:
                         te.initialize()
-                        print(f"Initialized Transformer Engine version: {te.__version__}")
+                        print(f"Initialized Transformer Engine version: {te.__version__}")  # noqa: F821
                     except Exception as e:
                         print(f"Warning: Transformer Engine initialization failed: {e}")
 
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {transformer_engine.__version__}")
+                    print(f"Transformer Engine version: {transformer_engine.__version__}")  # noqa: F821
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": transformer_engine.__version__,
+                            "te_version": transformer_engine.__version__,  # noqa: F821
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },
@@ -147,7 +149,7 @@
                         tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                         profiler.on_stage("tokenizer_loaded")
 
-                        model = GPTModel(
+                        model = GPTModel(  # noqa: F821
                             config=config,
                             vocab_size=tokenizer.vocab_size,
                             max_sequence_length=4096,

From cf9e0f9c770a229bb76e70fb4f9b35b00222a7f3 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Wed, 5 Feb 2025 15:30:14 -0500
Subject: [PATCH 13/16] chore: rm explicit tokenizer param

---
 dyana/loaders/megatron/Dockerfile       |  36 ++----
 dyana/loaders/megatron/main.py          | 159 +++++++++++++++++++++---
 dyana/loaders/megatron/requirements.txt |   1 +
 dyana/loaders/megatron/settings.yml     |  11 +-
 4 files changed, 159 insertions(+), 48 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index 45f48db..80384b9 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -10,29 +10,19 @@ RUN apt-get update && \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
 
-# Configure environment
-ENV CUDA_HOME=/usr/local/cuda
-ENV PATH=/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-ENV CUDA_LAUNCH_BLOCKING=1
-ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
-ENV CUDA_MODULE_LOADING=LAZY
-ENV TORCH_USE_CUDA_DSA=0
-ENV CUDA_DEVICE_MAX_CONNECTIONS=1
-ENV NCCL_ASYNC_ERROR_HANDLING=1
-ENV OMP_NUM_THREADS=1
-ENV NVTE_FRAMEWORK=pytorch
-ENV MAX_JOBS=4
-ENV DEBIAN_FRONTEND=noninteractive
-ENV TORCH_CUDNN_V8_API_ENABLED=1
-ENV TORCH_ALLOW_TF32=1
-ENV TORCH_SHOW_CPP_STACKTRACES=0
-ENV PYTHONWARNINGS=ignore
-ENV NVIDIA_VISIBLE_DEVICES="all"
-ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
-ENV TORCH_USE_CUDA_DSA=1
-ENV PYTORCH_JIT=0
-ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=0
+# Create required directories for multiprocessing
+RUN mkdir -p /dev/shm && \
+    mkdir -p /tmp/pytorch_extensions && \
+    mkdir -p /run/shm && \
+    chmod -R 777 /dev/shm /tmp/pytorch_extensions /run/shm
+
+# Create ALL required directories for IPC and shared memory
+RUN mkdir -p /dev/shm && \
+    mkdir -p /run/shm && \
+    mkdir -p /tmp/pytorch_extensions && \
+    mkdir -p /tmp/.pytorch_jit_cache && \
+    mkdir -p /tmp/transformers && \
+    chmod -R 777 /dev/shm /run/shm /tmp/pytorch_extensions /tmp/.pytorch_jit_cache /tmp/transformers
 
 # Only verify PyTorch version during build (not CUDA)
 RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 616a3c4..e45d3ad 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,4 +1,4 @@
-# ruff: noqa: I001, E402, F401, F821
+# ruff: noqa: I001, F401, E402, B904, F821
 # type: ignore
 import os
 import sys
@@ -13,33 +13,107 @@
 warnings.filterwarnings("ignore", category=UserWarning)
 
 # Import torch and configure CUDA
-import torch  # noqa: E402
+import torch
 
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 if torch.cuda.is_available():
-    torch.cuda.init()  # type: ignore[no-untyped-call]
+    torch.cuda.init()
     torch.cuda.set_device(0)
 
+
+def find_tokenizer(model_path: Path) -> Path:
+    """Find tokenizer file in model directory or alongside model file."""
+    patterns = [
+        # LLaMA specific patterns first
+        "llama*tokenizer*.model",  # LLaMA specific naming
+        "tokenizer.model",  # Standard LLaMA tokenizer
+        # Generic patterns as fallback
+        "*.model",  # sentencepiece models
+        "tokenizer.*",  # huggingface style
+        "*/tokenizer.*",  # nested folder
+        "vocab.*",  # vocabulary files
+        "merges.txt",  # BPE merges
+    ]
+
+    # Try both the model's directory and its parent directory
+    search_dirs = [model_path.parent]
+    if model_path.parent.parent.exists():
+        search_dirs.append(model_path.parent.parent)
+
+    print("\n=== Tokenizer Search ===", file=sys.stderr)
+
+    for directory in search_dirs:
+        print(f"Looking in: {directory}", file=sys.stderr)
+        print("Directory contents:", file=sys.stderr)
+        all_files = list(directory.glob("*"))
+        for f in sorted(all_files):
+            print(f"  {f}", file=sys.stderr)
+            # If it looks like a LLaMA tokenizer file, try it first
+            if "tokenizer" in f.name.lower() and f.name.endswith(".model"):
+                print(f"Found likely LLaMA tokenizer: {f}", file=sys.stderr)
+                return f
+
+        # If no obvious tokenizer found, try the patterns
+        print("\nTrying patterns:", file=sys.stderr)
+        for pattern in patterns:
+            print(f"  {pattern}...", file=sys.stderr, end=" ")
+            matches = list(directory.glob(pattern))
+            if matches:
+                print(f"Found: {matches[0]}", file=sys.stderr)
+                return matches[0]
+            print("No match", file=sys.stderr)
+
+    raise FileNotFoundError(
+        f"No tokenizer found in {[str(d) for d in search_dirs]} after trying patterns: {patterns}\n"
+        f"Available files in {model_path.parent}: {[f.name for f in model_path.parent.glob('*')]}"
+    )
+
+
 if __name__ == "__main__":
+    # Set multiprocessing start method
+    import multiprocessing
+
+    multiprocessing.set_start_method("spawn", force=True)
+
     captured_output = StringIO()
     with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
         try:
+            print("=== Starting Megatron Loader ===", file=sys.stderr)
             from dyana import Profiler
 
+            # Initialize CUDA
+            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+            os.environ["TORCH_USE_CUDA_DSA"] = "0"
+            os.environ["PYTORCH_JIT"] = "0"  # Disable JIT at env level
+            os.environ["TORCH_USE_RTLD_GLOBAL"] = "1"
+            os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1"  # Disable CUDA graphs
+
+            if not os.path.exists("/dev/shm"):
+                print("Warning: /dev/shm not found, creating...", file=sys.stderr)
+                os.makedirs("/dev/shm", exist_ok=True)
+
+            # PyTorch before other imports
+            print("=== Configuring PyTorch ===", file=sys.stderr)
+            # Disable JIT compilation using available methods
+            if hasattr(torch._C, "_jit_set_profiling_mode"):
+                torch._C._jit_set_profiling_mode(False)
+                print("✓ Disabled JIT profiling mode", file=sys.stderr)
+
             profiler = Profiler(gpu=True)
 
             if not torch.cuda.is_available():
                 raise RuntimeError("CUDA is not available but required")
 
             # Force CUDA initialization
-            torch.cuda.init()  # type: ignore[no-untyped-call]
+            torch.cuda.init()
             torch.cuda.set_device(0)
             # Allocate a small tensor to ensure CUDA is working
             test_tensor = torch.zeros(1, device="cuda")
             del test_tensor
             torch.cuda.empty_cache()
 
+            # GPU info
             device_name = torch.cuda.get_device_name()
             device_count = torch.cuda.device_count()
             cuda_version = torch.version.cuda
@@ -53,24 +127,71 @@
             )
             profiler.on_stage("cuda_initialized")
 
+            print("\n=== Importing Dependencies ===", file=sys.stderr)
+            try:
+                from transformers import LlamaTokenizer
+
+                print("✓ Imported LlamaTokenizer", file=sys.stderr)
+                from megatron.core import parallel_state
+
+                print("✓ Imported parallel_state", file=sys.stderr)
+                from megatron.core.transformer.transformer_config import TransformerConfig
+
+                print("✓ Imported TransformerConfig", file=sys.stderr)
+            except Exception as e:
+                print(f"Failed to import dependencies: {e}", file=sys.stderr)
+                profiler.track_error("imports", str(e))
+                raise
+
+            print("\n=== Parsing Arguments ===", file=sys.stderr)
             parser = argparse.ArgumentParser()
             parser.add_argument("--model", required=True)
-            parser.add_argument("--tokenizer", required=True)
             parser.add_argument("--size", choices=["7B", "13B"], required=True)
             parser.add_argument("--input", default="This is an example prompt.")
+            parser.add_argument("--tokenizer", help="Optional explicit tokenizer path")
             args = parser.parse_args()
 
             model_path = Path(args.model)
-            tokenizer_path = Path(args.tokenizer)
             if not model_path.exists():
                 raise FileNotFoundError(f"Model not found at {model_path}")
-            if not tokenizer_path.exists():
-                raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
-            profiler.on_stage("args_verified")
 
-            from transformers import LlamaTokenizer
-            from megatron.core import parallel_state
-            from megatron.core.transformer.transformer_config import TransformerConfig
+            print("\n=== Checking Files ===", file=sys.stderr)
+            print(f"Model path: {model_path}", file=sys.stderr)
+            print("Directory contents:", file=sys.stderr)
+            for f in sorted(model_path.parent.glob("*")):
+                print(f"  {f}", file=sys.stderr)
+
+            # Try explicit tokenizer path
+            if args.tokenizer:
+                tokenizer_path = Path(args.tokenizer)
+                if not tokenizer_path.exists():
+                    raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+                print(f"Using provided tokenizer: {tokenizer_path}", file=sys.stderr)
+            else:
+                # Otherwise search for tokenizer
+                tokenizer_path = find_tokenizer(model_path)
+                print(f"Found tokenizer: {tokenizer_path}", file=sys.stderr)
+
+            try:
+                print("\n=== Loading Tokenizer ===", file=sys.stderr)
+                print(f"Loading from: {tokenizer_path}", file=sys.stderr)
+
+                try:
+                    tokenizer = LlamaTokenizer.from_pretrained(
+                        str(tokenizer_path.parent),
+                        local_files_only=True,
+                        tokenizer_file=str(tokenizer_path.name),
+                    )
+                    print(f"Successfully loaded tokenizer (vocab_size={tokenizer.vocab_size})", file=sys.stderr)
+                except Exception as e:
+                    print(f"Failed to load tokenizer from {tokenizer_path}: {e}", file=sys.stderr)
+                    raise
+                print("=======================\n", file=sys.stderr)
+                profiler.on_stage("tokenizer_loaded")
+            except Exception as e:
+                print(f"Error loading tokenizer: {e}", file=sys.stderr)
+                profiler.track_error("tokenizer", str(e))
+                raise
 
             # Initialize profiler first
             initialized_parallel = False
@@ -95,20 +216,20 @@
 
                     try:
                         te.initialize()
-                        print(f"Initialized Transformer Engine version: {te.__version__}")  # noqa: F821
+                        print(f"Initialized Transformer Engine version: {te.__version__}")
                     except Exception as e:
                         print(f"Warning: Transformer Engine initialization failed: {e}")
 
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
                 try:
-                    print(f"Transformer Engine version: {transformer_engine.__version__}")  # noqa: F821
+                    print(f"Transformer Engine version: {te.__version__}")  # noqa: F821
                     print(f"CUDA devices: {torch.cuda.device_count()}")
                     print(f"CUDA version: {torch.version.cuda}")
                     profiler.track(
                         "env_info",
                         {
-                            "te_version": transformer_engine.__version__,  # noqa: F821
+                            "te_version": te.__version__,  # noqa: F821
                             "cuda_devices": torch.cuda.device_count(),
                             "cuda_version": torch.version.cuda,
                         },
@@ -146,7 +267,12 @@
                     profiler.on_stage("config_created")
 
                     try:
+                        # Load tokenizer
+                        print("\n=== Loading Tokenizer ===", file=sys.stderr)
+                        print(f"Loading from: {tokenizer_path}", file=sys.stderr)
                         tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
+                        print(f"Loaded tokenizer with vocab size: {tokenizer.vocab_size}", file=sys.stderr)
+                        print("=======================\n", file=sys.stderr)
                         profiler.on_stage("tokenizer_loaded")
 
                         model = GPTModel(  # noqa: F821
@@ -155,7 +281,7 @@
                             max_sequence_length=4096,
                             parallel_output=False,
                             share_embeddings_and_output_weights=True,
-                        ).cuda()  # GPU
+                        ).cuda()  # Explicit GPU
                         profiler.on_stage("model_created")
 
                         # Load DMC checkpoint directly to GPU
@@ -198,7 +324,6 @@
                 raise
 
             finally:
-                # Clean up Megatron's parallel state only if it was initialized
                 try:
                     if initialized_parallel:
                         parallel_state.destroy_model_parallel()
diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt
index fc435c0..cb30dfa 100644
--- a/dyana/loaders/megatron/requirements.txt
+++ b/dyana/loaders/megatron/requirements.txt
@@ -13,6 +13,7 @@ hydra-core==1.3.2
 hydra_colorlog==1.2.0
 nltk
 datasets
+transformers>=4.38.0
 
 # Utilities
 psutil>=5.6.7
\ No newline at end of file
diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
index 2aa2b75..db50ae8 100644
--- a/dyana/loaders/megatron/settings.yml
+++ b/dyana/loaders/megatron/settings.yml
@@ -5,12 +5,7 @@ build_args:
 
 args:
   - name: model
-    description: Path to Megatron model checkpoint
-    required: true
-    volume: true
-
-  - name: tokenizer
-    description: Path to Llama 2 tokenizer model
+    description: Path to model checkpoint (tokenizer should be in same directory)
     required: true
     volume: true
 
@@ -25,5 +20,5 @@ args:
     required: false
 
 examples:
-  - description: "Load a Megatron-DMC model with tokenizer:"
-    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B
+  - description: "Load a Megatron-DMC model:"
+    command: dyana trace --loader megatron --model /path/to/model.pt --size 7B

From 47155027cddb3b8652035e1d32f3a4d44683a1c8 Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Wed, 5 Feb 2025 16:02:24 -0500
Subject: [PATCH 14/16] fix: missing tokenizer non mandatory

---
 dyana/loaders/megatron/settings.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
index db50ae8..59dfe7f 100644
--- a/dyana/loaders/megatron/settings.yml
+++ b/dyana/loaders/megatron/settings.yml
@@ -19,6 +19,13 @@ args:
     default: "This is an example prompt."
     required: false
 
+  - name: tokenizer
+    description: Optional explicit path to tokenizer file (otherwise auto-detected)
+    required: false
+    volume: true
+
 examples:
-  - description: "Load a Megatron-DMC model:"
+  - description: "Load a Megatron-DMC model with auto-detected tokenizer:"
     command: dyana trace --loader megatron --model /path/to/model.pt --size 7B
+  - description: "Load model with explicit tokenizer path:"
+    command: dyana trace --loader megatron --model /path/to/model.pt --size 7B --tokenizer /path/to/tokenizer.model

From 420d6ebc484afdf636471d8b0ca2e913c917479d Mon Sep 17 00:00:00 2001
From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Wed, 5 Feb 2025 16:10:43 -0500
Subject: [PATCH 15/16] chore: stop hating on jit

---
 dyana/loaders/megatron/main.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index e45d3ad..7964df8 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -85,7 +85,6 @@ def find_tokenizer(model_path: Path) -> Path:
             # Initialize CUDA
             os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
             os.environ["TORCH_USE_CUDA_DSA"] = "0"
-            os.environ["PYTORCH_JIT"] = "0"  # Disable JIT at env level
             os.environ["TORCH_USE_RTLD_GLOBAL"] = "1"
             os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1"  # Disable CUDA graphs
 
@@ -93,13 +92,6 @@ def find_tokenizer(model_path: Path) -> Path:
                 print("Warning: /dev/shm not found, creating...", file=sys.stderr)
                 os.makedirs("/dev/shm", exist_ok=True)
 
-            # PyTorch before other imports
-            print("=== Configuring PyTorch ===", file=sys.stderr)
-            # Disable JIT compilation using available methods
-            if hasattr(torch._C, "_jit_set_profiling_mode"):
-                torch._C._jit_set_profiling_mode(False)
-                print("✓ Disabled JIT profiling mode", file=sys.stderr)
-
             profiler = Profiler(gpu=True)
 
             if not torch.cuda.is_available():

From a37fe39e52e8eb2d890661d31bae9f2c30f66d95 Mon Sep 17 00:00:00 2001
From: evilsocket <evilsocket@gmail.com>
Date: Thu, 6 Feb 2025 16:45:37 +0100
Subject: [PATCH 16/16] some cleaning

---
 dyana/loaders/megatron/Dockerfile   |  28 +-
 dyana/loaders/megatron/main.py      | 381 ++++++++--------------------
 dyana/loaders/megatron/settings.yml |   3 +-
 3 files changed, 105 insertions(+), 307 deletions(-)

diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile
index 80384b9..6168a62 100644
--- a/dyana/loaders/megatron/Dockerfile
+++ b/dyana/loaders/megatron/Dockerfile
@@ -45,30 +45,6 @@ RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /
     cd /app/Megatron-LM && \
     pip install -e .
 
-ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH
+ENV PYTHONPATH=/app/workspac:/app/Megatron-LM:$PYTHONPATH
 
-# Create directories for IPC
-RUN mkdir -p /dev/shm && \
-    mkdir -p /tmp/pytorch_extensions && \
-    chmod -R 777 /dev/shm /tmp/pytorch_extensions
-
-# Create entrypoint script
-RUN printf '#!/bin/bash\n\
-    python3 -c "import torch; assert torch.cuda.is_available(), \"CUDA is not available\"; device=torch.cuda.get_device_name(); print(f\"CUDA OK: {device}\")" && \
-    export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH && \
-    exec python3 -W ignore main.py "$@"' > /app/workspace/entrypoint.sh && \
-    chmod +x /app/workspace/entrypoint.sh
-
-# Verify files exist and perms
-RUN ls -la /app/workspace && \
-    ls -la /app/workspace/entrypoint.sh && \
-    test -x /app/workspace/entrypoint.sh
-
-# Set proper ownership and permissions
-RUN chown -R root:root /app && \
-    chmod -R 755 /app && \
-    chmod +x /app/workspace/entrypoint.sh
-
-# Use bash as entrypoint shell
-SHELL ["/bin/bash", "-c"]
-ENTRYPOINT ["/bin/bash", "-c", "exec /app/workspace/entrypoint.sh \"$@\""]
\ No newline at end of file
+ENTRYPOINT ["python3", "-W", "ignore", "main.py"]
\ No newline at end of file
diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py
index 7964df8..c0de51c 100644
--- a/dyana/loaders/megatron/main.py
+++ b/dyana/loaders/megatron/main.py
@@ -1,25 +1,23 @@
-# ruff: noqa: I001, F401, E402, B904, F821
-# type: ignore
-import os
-import sys
+import argparse
 import logging
+import sys
 import warnings
-import argparse
+import multiprocessing
 from pathlib import Path
-from io import StringIO
-import contextlib
 
 logging.basicConfig(level=logging.ERROR)
 warnings.filterwarnings("ignore", category=UserWarning)
 
-# Import torch and configure CUDA
 import torch
 
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-if torch.cuda.is_available():
-    torch.cuda.init()
-    torch.cuda.set_device(0)
+multiprocessing.set_start_method("spawn", force=True)
+
+import transformer_engine.pytorch as te
+from megatron.core import parallel_state
+from megatron.core.transformer.transformer_config import TransformerConfig
+from transformers import LlamaTokenizer
+
+from dyana import Profiler
 
 
 def find_tokenizer(model_path: Path) -> Path:
@@ -41,28 +39,19 @@ def find_tokenizer(model_path: Path) -> Path:
     if model_path.parent.parent.exists():
         search_dirs.append(model_path.parent.parent)
 
-    print("\n=== Tokenizer Search ===", file=sys.stderr)
-
     for directory in search_dirs:
-        print(f"Looking in: {directory}", file=sys.stderr)
-        print("Directory contents:", file=sys.stderr)
         all_files = list(directory.glob("*"))
         for f in sorted(all_files):
             print(f"  {f}", file=sys.stderr)
             # If it looks like a LLaMA tokenizer file, try it first
             if "tokenizer" in f.name.lower() and f.name.endswith(".model"):
-                print(f"Found likely LLaMA tokenizer: {f}", file=sys.stderr)
                 return f
 
         # If no obvious tokenizer found, try the patterns
-        print("\nTrying patterns:", file=sys.stderr)
         for pattern in patterns:
-            print(f"  {pattern}...", file=sys.stderr, end=" ")
             matches = list(directory.glob(pattern))
             if matches:
-                print(f"Found: {matches[0]}", file=sys.stderr)
                 return matches[0]
-            print("No match", file=sys.stderr)
 
     raise FileNotFoundError(
         f"No tokenizer found in {[str(d) for d in search_dirs]} after trying patterns: {patterns}\n"
@@ -70,263 +59,97 @@ def find_tokenizer(model_path: Path) -> Path:
     )
 
 
-if __name__ == "__main__":
-    # Set multiprocessing start method
-    import multiprocessing
-
-    multiprocessing.set_start_method("spawn", force=True)
-
-    captured_output = StringIO()
-    with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
-        try:
-            print("=== Starting Megatron Loader ===", file=sys.stderr)
-            from dyana import Profiler
-
-            # Initialize CUDA
-            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-            os.environ["TORCH_USE_CUDA_DSA"] = "0"
-            os.environ["TORCH_USE_RTLD_GLOBAL"] = "1"
-            os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1"  # Disable CUDA graphs
-
-            if not os.path.exists("/dev/shm"):
-                print("Warning: /dev/shm not found, creating...", file=sys.stderr)
-                os.makedirs("/dev/shm", exist_ok=True)
-
-            profiler = Profiler(gpu=True)
-
-            if not torch.cuda.is_available():
-                raise RuntimeError("CUDA is not available but required")
-
-            # Force CUDA initialization
-            torch.cuda.init()
-            torch.cuda.set_device(0)
-            # Allocate a small tensor to ensure CUDA is working
-            test_tensor = torch.zeros(1, device="cuda")
-            del test_tensor
-            torch.cuda.empty_cache()
-
-            # GPU info
-            device_name = torch.cuda.get_device_name()
-            device_count = torch.cuda.device_count()
-            cuda_version = torch.version.cuda
-            gpu_mem = torch.cuda.get_device_properties(0).total_memory
-            print(
-                f"Found {device_count} CUDA devices, using {device_name} with {gpu_mem / 1e9:.1f}GB memory",
-                file=sys.stderr,
-            )
-            profiler.track(
-                "gpu_info", {"device": device_name, "count": device_count, "cuda": cuda_version, "memory": gpu_mem}
-            )
-            profiler.on_stage("cuda_initialized")
-
-            print("\n=== Importing Dependencies ===", file=sys.stderr)
-            try:
-                from transformers import LlamaTokenizer
-
-                print("✓ Imported LlamaTokenizer", file=sys.stderr)
-                from megatron.core import parallel_state
-
-                print("✓ Imported parallel_state", file=sys.stderr)
-                from megatron.core.transformer.transformer_config import TransformerConfig
-
-                print("✓ Imported TransformerConfig", file=sys.stderr)
-            except Exception as e:
-                print(f"Failed to import dependencies: {e}", file=sys.stderr)
-                profiler.track_error("imports", str(e))
-                raise
-
-            print("\n=== Parsing Arguments ===", file=sys.stderr)
-            parser = argparse.ArgumentParser()
-            parser.add_argument("--model", required=True)
-            parser.add_argument("--size", choices=["7B", "13B"], required=True)
-            parser.add_argument("--input", default="This is an example prompt.")
-            parser.add_argument("--tokenizer", help="Optional explicit tokenizer path")
-            args = parser.parse_args()
-
-            model_path = Path(args.model)
-            if not model_path.exists():
-                raise FileNotFoundError(f"Model not found at {model_path}")
-
-            print("\n=== Checking Files ===", file=sys.stderr)
-            print(f"Model path: {model_path}", file=sys.stderr)
-            print("Directory contents:", file=sys.stderr)
-            for f in sorted(model_path.parent.glob("*")):
-                print(f"  {f}", file=sys.stderr)
+def load_tokenizer(args) -> LlamaTokenizer:
+    if args.tokenizer:
+        tokenizer_path = Path(args.tokenizer)
+        if not tokenizer_path.exists():
+            raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
+    else:
+        # Otherwise search for tokenizer
+        tokenizer_path = find_tokenizer(model_path)
 
-            # Try explicit tokenizer path
-            if args.tokenizer:
-                tokenizer_path = Path(args.tokenizer)
-                if not tokenizer_path.exists():
-                    raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
-                print(f"Using provided tokenizer: {tokenizer_path}", file=sys.stderr)
-            else:
-                # Otherwise search for tokenizer
-                tokenizer_path = find_tokenizer(model_path)
-                print(f"Found tokenizer: {tokenizer_path}", file=sys.stderr)
-
-            try:
-                print("\n=== Loading Tokenizer ===", file=sys.stderr)
-                print(f"Loading from: {tokenizer_path}", file=sys.stderr)
-
-                try:
-                    tokenizer = LlamaTokenizer.from_pretrained(
-                        str(tokenizer_path.parent),
-                        local_files_only=True,
-                        tokenizer_file=str(tokenizer_path.name),
-                    )
-                    print(f"Successfully loaded tokenizer (vocab_size={tokenizer.vocab_size})", file=sys.stderr)
-                except Exception as e:
-                    print(f"Failed to load tokenizer from {tokenizer_path}: {e}", file=sys.stderr)
-                    raise
-                print("=======================\n", file=sys.stderr)
-                profiler.on_stage("tokenizer_loaded")
-            except Exception as e:
-                print(f"Error loading tokenizer: {e}", file=sys.stderr)
-                profiler.track_error("tokenizer", str(e))
-                raise
-
-            # Initialize profiler first
-            initialized_parallel = False
-
-            try:
-                # Use fork multiprocessing
-                if sys.platform == "linux":
-                    import torch.multiprocessing as mp
-
-                    mp.set_start_method("fork", force=True)
-
-                if torch.cuda.is_available():
-                    print("=== Runtime Configuration ===")
-                    print(f"PyTorch: {torch.__version__}")
-                    print(f"CUDA: {torch.version.cuda}")
-                    print(f"Device: {torch.cuda.get_device_name()}")
-                    print("===========================")
-                    profiler.on_stage("cuda_verified")
-
-                if torch.cuda.is_available():
-                    import transformer_engine.pytorch as te
-
-                    try:
-                        te.initialize()
-                        print(f"Initialized Transformer Engine version: {te.__version__}")
-                    except Exception as e:
-                        print(f"Warning: Transformer Engine initialization failed: {e}")
-
-                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-                try:
-                    print(f"Transformer Engine version: {te.__version__}")  # noqa: F821
-                    print(f"CUDA devices: {torch.cuda.device_count()}")
-                    print(f"CUDA version: {torch.version.cuda}")
-                    profiler.track(
-                        "env_info",
-                        {
-                            "te_version": te.__version__,  # noqa: F821
-                            "cuda_devices": torch.cuda.device_count(),
-                            "cuda_version": torch.version.cuda,
-                        },
-                    )
-
-                    # Megatron's tensor parallel
-                    world_size = torch.cuda.device_count()
-                    parallel_state.initialize_model_parallel(
-                        tensor_model_parallel_size=1,  # No tensor parallelism for now
-                        pipeline_model_parallel_size=1,  # No pipeline parallelism
-                    )
-                    profiler.on_stage("megatron_initialized")
-
-                    # parallel state initialization
-                    initialized_parallel = True
-
-                    # Model config
-                    model_config = {
-                        "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
-                        "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
-                    }[args.size]
-
-                    # Megatron transformer config
-                    config = TransformerConfig(
-                        num_layers=model_config["num_layers"],
-                        hidden_size=model_config["hidden_size"],
-                        num_attention_heads=model_config["num_attention_heads"],
-                        max_position_embeddings=4096,
-                        init_method_std=0.02,
-                        use_scaled_init_method=True,
-                        attention_softmax_in_fp32=True,
-                        rotary_pct=0.25,  # LLaMA uses rotary embeddings
-                    )
-                    profiler.track("model_config", model_config)
-                    profiler.on_stage("config_created")
-
-                    try:
-                        # Load tokenizer
-                        print("\n=== Loading Tokenizer ===", file=sys.stderr)
-                        print(f"Loading from: {tokenizer_path}", file=sys.stderr)
-                        tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
-                        print(f"Loaded tokenizer with vocab size: {tokenizer.vocab_size}", file=sys.stderr)
-                        print("=======================\n", file=sys.stderr)
-                        profiler.on_stage("tokenizer_loaded")
-
-                        model = GPTModel(  # noqa: F821
-                            config=config,
-                            vocab_size=tokenizer.vocab_size,
-                            max_sequence_length=4096,
-                            parallel_output=False,
-                            share_embeddings_and_output_weights=True,
-                        ).cuda()  # Explicit GPU
-                        profiler.on_stage("model_created")
-
-                        # Load DMC checkpoint directly to GPU
-                        checkpoint = torch.load(str(model_path), map_location="cuda")
-                        model.load_state_dict(checkpoint)
-                        model.eval()
-                        torch.cuda.synchronize()  # Ensure model is loaded to GPU
-                        profiler.on_stage("model_loaded")
-
-                        # Run inference
-                        input_ids = tokenizer(args.input, return_tensors="pt").to(device)
-                        with torch.no_grad():
-                            output = model(input_ids=input_ids["input_ids"])
-                            logits = output.logits
-                            next_token = torch.argmax(logits[:, -1, :], dim=-1)
-                            generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
-                            text = tokenizer.decode(generated[0], skip_special_tokens=True)
-                            profiler.track("output", text)
-                            profiler.on_stage("inference_complete")
-
-                    except Exception as e:
-                        profiler.track_error("model", str(e))
-                        print(f"Model loading/inference failed: {e}")
-                        if torch.cuda.is_available():
-                            torch.cuda.empty_cache()
-                        raise
-
-                except Exception as e:
-                    print(f"Error occurred: {str(e)}")
-                    profiler.track_error("model", str(e))
-                    if torch.cuda.is_available():
-                        torch.cuda.empty_cache()
-                    raise
-
-            except Exception as e:
-                profiler.track_error("setup", str(e))
-                print(f"Setup error: {e}")
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                raise
+    return LlamaTokenizer.from_pretrained(
+        str(tokenizer_path.parent),
+        local_files_only=True,
+        tokenizer_file=str(tokenizer_path.name),
+    )
 
-            finally:
-                try:
-                    if initialized_parallel:
-                        parallel_state.destroy_model_parallel()
-                except Exception as e:
-                    profiler.track_error("cleanup", str(e))
-                    print(f"Cleanup error: {e}")
 
-        except Exception as e:
-            profiler.track_error("runtime", str(e))
-            print(f"Error: {e}", file=sys.stderr)
-            raise
-        finally:
-            profiler.flush()
-            print(captured_output.getvalue(), file=sys.stderr)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--size", choices=["7B", "13B"], default="7B")
+    parser.add_argument("--input", default="This is an example prompt.")
+    parser.add_argument("--tokenizer", help="Optional explicit tokenizer path")
+    args = parser.parse_args()
+
+    model_config = {
+        "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
+        "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
+    }[args.size]
+
+    profiler = Profiler(gpu=True)
+
+    try:
+        model_path = Path(args.model)
+        if not model_path.exists():
+            raise FileNotFoundError(f"Model not found at {model_path}")
+
+        tokenizer = load_tokenizer(args)
+        profiler.on_stage("tokenizer_loaded")
+
+        te.initialize()
+
+        has_gpu = torch.cuda.is_available()
+        device = torch.device("cuda" if has_gpu else "cpu")
+
+        # Megatron's tensor parallel
+        parallel_state.initialize_model_parallel(
+            tensor_model_parallel_size=1,  # No tensor parallelism for now
+            pipeline_model_parallel_size=1,  # No pipeline parallelism
+        )
+        profiler.on_stage("megatron_initialized")
+
+        # Megatron transformer config
+        config = TransformerConfig(
+            num_layers=model_config["num_layers"],
+            hidden_size=model_config["hidden_size"],
+            num_attention_heads=model_config["num_attention_heads"],
+            max_position_embeddings=4096,
+            init_method_std=0.02,
+            use_scaled_init_method=True,
+            attention_softmax_in_fp32=True,
+            rotary_pct=0.25,  # LLaMA uses rotary embeddings
+        )
+
+        model = GPTModel(  # noqa: F821
+            config=config,
+            vocab_size=tokenizer.vocab_size,
+            max_sequence_length=4096,
+            parallel_output=False,
+            share_embeddings_and_output_weights=True,
+        )
+        if has_gpu:
+            model = model.cuda()
+
+        profiler.on_stage("model_created")
+
+        # Load DMC checkpoint directly to GPU
+        checkpoint = torch.load(str(model_path), map_location=device)
+        model.load_state_dict(checkpoint)
+        model.eval()
+        profiler.on_stage("model_loaded")
+
+        # Run inference
+        input_ids = tokenizer(args.input, return_tensors="pt").to(device)
+        with torch.no_grad():
+            output = model(input_ids=input_ids["input_ids"])
+            logits = output.logits
+            next_token = torch.argmax(logits[:, -1, :], dim=-1)
+            generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
+            text = tokenizer.decode(generated[0], skip_special_tokens=True)
+            profiler.track("output", text)
+            profiler.on_stage("inference_complete")
+
+    except Exception as e:
+        profiler.track_error("megatron", str(e))
diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml
index 59dfe7f..608ae21 100644
--- a/dyana/loaders/megatron/settings.yml
+++ b/dyana/loaders/megatron/settings.yml
@@ -11,8 +11,7 @@ args:
 
   - name: size
     description: Model size (7B or 13B)
-    required: true
-    choices: ["7B", "13B"]
+    required: false
 
   - name: input
     description: Input text for inference