feat: custom [megatron] nvidia dmc loader #39

Open · wants to merge 17 commits into main

Changes from 10 commits
3 changes: 3 additions & 0 deletions dyana/loaders/megatron/.gitignore
@@ -0,0 +1,3 @@
dyana.py
dyana-requirements.txt
dyana-requirements-gpu.txt
87 changes: 87 additions & 0 deletions dyana/loaders/megatron/Dockerfile
@@ -0,0 +1,87 @@
FROM nvcr.io/nvidia/pytorch:24.04-py3

WORKDIR /app

# Install system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
ca-certificates \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Configure environment
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV CUDA_LAUNCH_BLOCKING=1
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
ENV CUDA_MODULE_LOADING=LAZY
ENV TORCH_USE_CUDA_DSA=0
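# Megatron-LM recommends CUDA_DEVICE_MAX_CONNECTIONS=1 so kernels launch in submission order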
ENV CUDA_DEVICE_MAX_CONNECTIONS=1
ENV NCCL_ASYNC_ERROR_HANDLING=1
ENV OMP_NUM_THREADS=1
ENV NVTE_FRAMEWORK=pytorch
ENV MAX_JOBS=4
ENV DEBIAN_FRONTEND=noninteractive
ENV TORCH_CUDNN_V8_API_ENABLED=1
ENV TORCH_ALLOW_TF32=1
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
ENV PYTORCH_JIT=0
ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1
ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0
ENV PYTHONFAULTHANDLER=1
ENV PYTHONUNBUFFERED=1
ENV NCCL_IB_DISABLE=1
ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1
ENV TORCH_SHOW_CPP_STACKTRACES=0
ENV PYTHONWARNINGS=ignore

# Verify the PyTorch installation at build time
RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"

# Create working directory
RUN mkdir -p /app/workspace

# Copy loader sources and requirements into the workspace
COPY requirements.txt /app/workspace/
COPY *.py /app/workspace/
COPY dyana-requirements*.txt /app/workspace/

WORKDIR /app/workspace

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install Megatron-LM from NVIDIA's dmc branch, which carries the DMC changes
RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /app/Megatron-LM && \
cd /app/Megatron-LM && \
pip install -e .

ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH

# Create directories for IPC
RUN mkdir -p /dev/shm && \
mkdir -p /tmp/pytorch_extensions && \
chmod -R 777 /dev/shm /tmp/pytorch_extensions

# Create the entrypoint script: clear stale semaphores, set PYTHONPATH, exec the loader
RUN printf '%s\n' \
    '#!/bin/bash' \
    '# Clear any stale semaphores' \
    'rm -rf /dev/shm/* 2>/dev/null' \
    'export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH' \
    'exec python3 -W ignore main.py "$@"' \
    > /app/workspace/entrypoint.sh && \
    chmod +x /app/workspace/entrypoint.sh

# Set ownership and permissions, then verify the entrypoint is executable
RUN chown -R root:root /app && \
    chmod -R 755 /app && \
    chmod +x /app/workspace/entrypoint.sh && \
    test -x /app/workspace/entrypoint.sh

# The entrypoint script is executable and carries a shebang, so run it directly
ENTRYPOINT ["/app/workspace/entrypoint.sh"]
219 changes: 219 additions & 0 deletions dyana/loaders/megatron/main.py
@@ -0,0 +1,219 @@
# ruff: noqa: I001, E402
# type: ignore
import os
import sys
import logging
import warnings
import argparse
from pathlib import Path
from io import StringIO
import contextlib

logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["PYTHONWARNINGS"] = "ignore"

os.environ.update(
{
"CUDA_LAUNCH_BLOCKING": "1",
"PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
"TORCH_USE_CUDA_DSA": "0",
"NVTE_FRAMEWORK": "pytorch",
"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32",
"TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1",
"TORCH_INDUCTOR_USE_PYTHON_BINDING": "0",
"TORCH_SHOW_CPP_STACKTRACES": "0",
}
)

import torch # noqa: E402

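# Disable the TorchScript fusers so the model executes eagerly while being profiled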
torch._C._jit_set_nvfuser_enabled(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)

if __name__ == "__main__":
captured_output = StringIO()
with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
try:
from dyana import Profiler

profiler = Profiler(gpu=True)

# Initialize CUDA
if torch.cuda.is_available():
torch.cuda.init() # type: ignore[no-untyped-call]
torch.cuda.set_device(0)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
profiler.track(
"cuda_info",
{
"version": torch.version.cuda,
"device": torch.cuda.get_device_name(),
"device_count": torch.cuda.device_count(),
},
)
profiler.on_stage("cuda_initialized")

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--tokenizer", required=True)
parser.add_argument("--size", choices=["7B", "13B"], required=True)
parser.add_argument("--input", default="This is an example prompt.")
args = parser.parse_args()

model_path = Path(args.model)
tokenizer_path = Path(args.tokenizer)
if not model_path.exists():
raise FileNotFoundError(f"Model not found at {model_path}")
if not tokenizer_path.exists():
raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
profiler.on_stage("args_verified")

            from transformers import LlamaTokenizer
            from megatron.core import parallel_state

            # GPTModel import path per current Megatron-LM core; adjust if the dmc branch differs
            from megatron.core.models.gpt.gpt_model import GPTModel
            from megatron.core.transformer.transformer_config import TransformerConfig

            # Track whether Megatron's parallel state was initialized, so cleanup is conditional
            initialized_parallel = False

try:
# Use fork multiprocessing
if sys.platform == "linux":
import torch.multiprocessing as mp

mp.set_start_method("fork", force=True)

if torch.cuda.is_available():
print("=== Runtime Configuration ===")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")
print(f"Device: {torch.cuda.get_device_name()}")
print("===========================")
profiler.on_stage("cuda_verified")

                te_version = "unavailable"
                if torch.cuda.is_available():
                    # Transformer Engine needs no explicit init call; importing it is sufficient
                    try:
                        import transformer_engine

                        te_version = transformer_engine.__version__
                        print(f"Transformer Engine version: {te_version}")
                    except Exception as e:
                        print(f"Warning: Transformer Engine import failed: {e}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

                try:
                    print(f"CUDA devices: {torch.cuda.device_count()}")
                    print(f"CUDA version: {torch.version.cuda}")
                    profiler.track(
                        "env_info",
                        {
                            "te_version": te_version,
                            "cuda_devices": torch.cuda.device_count(),
                            "cuda_version": torch.version.cuda,
                        },
                    )

                    # Initialize Megatron's model-parallel state; a single process means
                    # no tensor or pipeline parallelism
                    parallel_state.initialize_model_parallel(
                        tensor_model_parallel_size=1,
                        pipeline_model_parallel_size=1,
                    )
profiler.on_stage("megatron_initialized")

                    initialized_parallel = True

# Model config
model_config = {
"7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
"13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
}[args.size]

# Megatron transformer config
config = TransformerConfig(
num_layers=model_config["num_layers"],
hidden_size=model_config["hidden_size"],
num_attention_heads=model_config["num_attention_heads"],
max_position_embeddings=4096,
init_method_std=0.02,
use_scaled_init_method=True,
attention_softmax_in_fp32=True,
rotary_pct=0.25, # LLaMA uses rotary embeddings
)
profiler.track("model_config", model_config)
profiler.on_stage("config_created")

try:
tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
profiler.on_stage("tokenizer_loaded")

                        model = GPTModel(
config=config,
vocab_size=tokenizer.vocab_size,
max_sequence_length=4096,
parallel_output=False,
share_embeddings_and_output_weights=True,
)
profiler.on_stage("model_created")

                        # Load the DMC checkpoint (torch.load deserializes via pickle)
checkpoint = torch.load(str(model_path), map_location=device)
model.load_state_dict(checkpoint)
model.cuda()
model.eval()
profiler.on_stage("model_loaded")

# Run inference
input_ids = tokenizer(args.input, return_tensors="pt").to(device)
with torch.no_grad():
output = model(input_ids=input_ids["input_ids"])
logits = output.logits
next_token = torch.argmax(logits[:, -1, :], dim=-1)
generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
text = tokenizer.decode(generated[0], skip_special_tokens=True)
profiler.track("output", text)
profiler.on_stage("inference_complete")

except Exception as e:
profiler.track_error("model", str(e))
print(f"Model loading/inference failed: {e}")
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

except Exception as e:
print(f"Error occurred: {str(e)}")
profiler.track_error("model", str(e))
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

except Exception as e:
profiler.track_error("setup", str(e))
print(f"Setup error: {e}")
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

finally:
# Clean up Megatron's parallel state only if it was initialized
try:
if initialized_parallel:
parallel_state.destroy_model_parallel()
except Exception as e:
profiler.track_error("cleanup", str(e))
print(f"Cleanup error: {e}")

except Exception as e:
profiler.track_error("runtime", str(e))
print(f"Error: {e}", file=sys.stderr)
raise
finally:
profiler.flush()
print(captured_output.getvalue(), file=sys.stderr)
18 changes: 18 additions & 0 deletions dyana/loaders/megatron/requirements.txt
@@ -0,0 +1,18 @@
--extra-index-url https://download.pytorch.org/whl/cu121
--find-links https://developer.download.nvidia.com/compute/redist

# Base dependencies from Megatron core
torch>=2.0.0
packaging>=20.0
typing_extensions>=4.0.0

# Megatron DMC dependencies
flash-attn==2.6.1
sentencepiece==0.2.0
hydra-core==1.3.2
hydra_colorlog==1.2.0
nltk
datasets

# Utilities
psutil>=5.6.7
29 changes: 29 additions & 0 deletions dyana/loaders/megatron/settings.yml
@@ -0,0 +1,29 @@
description: Loads and profiles Megatron-LM DMC models for efficient inference

build_args:
extra-requirements: EXTRA_REQUIREMENTS

args:
- name: model
description: Path to Megatron model checkpoint
required: true
volume: true

- name: tokenizer
description: Path to Llama 2 tokenizer model
required: true
volume: true

- name: size
description: Model size (7B or 13B)
required: true
choices: ["7B", "13B"]

- name: input
description: Input text for inference
default: "This is an example prompt."
required: false

examples:
- description: "Load a Megatron-DMC model with tokenizer:"
command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B
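  # Additional example (sketch, same placeholder paths): override the default prompt via the optional --input flag
  - description: "Load a Megatron-DMC model with a custom prompt:"
    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 13B --input "Describe dynamic memory compression."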