feat: custom [megatron] nvidia dmc loader #39

Open · wants to merge 17 commits into base: main · showing changes from 6 commits
3 changes: 3 additions & 0 deletions dyana/loaders/megatron/.gitignore
@@ -0,0 +1,3 @@
dyana.py
dyana-requirements.txt
dyana-requirements-gpu.txt
75 changes: 75 additions & 0 deletions dyana/loaders/megatron/Dockerfile
@@ -0,0 +1,75 @@
FROM nvcr.io/nvidia/pytorch:24.04-py3

WORKDIR /app

# Install system dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        git \
        ca-certificates \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Configure environment
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV CUDA_LAUNCH_BLOCKING=1
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
ENV CUDA_MODULE_LOADING=LAZY
ENV TORCH_USE_CUDA_DSA=1
ENV CUDA_DEVICE_MAX_CONNECTIONS=1
ENV NCCL_ASYNC_ERROR_HANDLING=1
ENV OMP_NUM_THREADS=1
ENV NVTE_FRAMEWORK=pytorch
ENV MAX_JOBS=4
ENV DEBIAN_FRONTEND=noninteractive
ENV TORCH_CUDNN_V8_API_ENABLED=1
ENV TORCH_ALLOW_TF32=1
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
ENV PYTORCH_JIT=0
ENV TORCH_COMPILE_DEBUG=1
ENV TORCH_INDUCTOR_VAR_NAMES=1

# Verify that PyTorch imports during the build (full CUDA init happens at runtime)
RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"

# Create working directory
RUN mkdir -p /app/workspace

# Copy loader files into the workspace: requirements first, then sources
COPY requirements.txt /app/workspace/
COPY *.py /app/workspace/
COPY dyana-requirements*.txt /app/workspace/

WORKDIR /app/workspace

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install Megatron-LM
RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /app/Megatron-LM && \
    cd /app/Megatron-LM && \
    pip install -e .

ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH

# Create the entrypoint script
RUN printf '#!/bin/bash\n\
export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\
export PYTORCH_NO_CUDA_MEMORY_CACHING=1\n\
exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \
    chmod +x /app/workspace/entrypoint.sh

# Verify files exist and have correct permissions
RUN ls -la /app/workspace && \
    ls -la /app/workspace/entrypoint.sh && \
    test -x /app/workspace/entrypoint.sh

# Set proper ownership and permissions
RUN chown -R root:root /app && \
    chmod -R 755 /app && \
    chmod +x /app/workspace/entrypoint.sh

# Use full path in entrypoint
ENTRYPOINT ["/app/workspace/entrypoint.sh"]
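A quick way to exercise the built image end to end (a hedged sketch, not part of this PR: the "dyana-megatron" tag and the host paths are made up for illustration; the entrypoint simply forwards its arguments to main.py):

import subprocess

# Build the loader image first, e.g.: docker build -t dyana-megatron .
subprocess.run(
    [
        "docker", "run", "--rm", "--gpus", "all",
        "-v", "/host/models:/models",          # hypothetical host mount
        "dyana-megatron",                      # hypothetical image tag
        "--model", "/models/dmc-7b.pt",        # forwarded to main.py --model
        "--tokenizer", "/models/tokenizer.model",
        "--size", "7B",
    ],
    check=True,
)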
166 changes: 166 additions & 0 deletions dyana/loaders/megatron/main.py
@@ -0,0 +1,166 @@
import argparse
import os
from pathlib import Path

import torch
import transformer_engine as te
from megatron.model.gpt_model import GPTModel

from dyana.profiler import Profiler


def verify_cuda_setup() -> None:
    """Verify CUDA and PyTorch setup before model loading."""
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available")

    # Disable JIT/Inductor features
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)

    print("=== Runtime Configuration ===")
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA: {torch.version.cuda}")
    print(f"Device: {torch.cuda.get_device_name()}")
    print("===========================")

    torch.cuda.set_device(0)


if __name__ == "__main__":
profiler = Profiler(gpu=True)

try:
# Verify CUDA setup
verify_cuda_setup()
profiler.on_stage("cuda_verified")

os.environ["TE_VERBOSE"] = "1"
os.environ["NVTE_FRAMEWORK"] = "pytorch"
print("Starting Megatron loader with verbose logging...")

# Initialize CUDA and Transformer Engine
if torch.cuda.is_available():
import transformer_engine.pytorch as te

te.initialize()
print(f"Initialized Transformer Engine version: {te.__version__}")

        from megatron.core import parallel_state
        from megatron.core.transformer.transformer_config import TransformerConfig
        from transformers import LlamaTokenizer

        parser = argparse.ArgumentParser()
        parser.add_argument("--model", required=True)
        parser.add_argument("--tokenizer", required=True)
        parser.add_argument("--size", choices=["7B", "13B"], required=True)
        parser.add_argument("--input", default="This is an example prompt.")
        args = parser.parse_args()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        try:
            print(f"Transformer Engine version: {te.__version__}")
            print(f"CUDA devices: {torch.cuda.device_count()}")
            print(f"CUDA version: {torch.version.cuda}")
            profiler.track(
                "env_info",
                {
                    "te_version": te.__version__,
                    "cuda_devices": torch.cuda.device_count(),
                    "cuda_version": torch.version.cuda,
                },
            )

            model_path = Path(args.model)
            tokenizer_path = Path(args.tokenizer)
            if not model_path.exists():
                raise FileNotFoundError(f"Model not found at {model_path}")
            if not tokenizer_path.exists():
                raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")

            # Initialize Megatron's model parallel state. parallel_state requires
            # torch.distributed to be initialized, even for a single process.
            if not torch.distributed.is_initialized():
                os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
                os.environ.setdefault("MASTER_PORT", "29500")
                torch.distributed.init_process_group(backend="nccl", world_size=1, rank=0)
            parallel_state.initialize_model_parallel(
                tensor_model_parallel_size=1,  # no tensor parallelism for now
                pipeline_model_parallel_size=1,  # no pipeline parallelism
            )
            profiler.on_stage("megatron_initialized")

            # Model config based on size
            model_config = {
                "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
                "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
            }[args.size]

            config = TransformerConfig(
                num_layers=model_config["num_layers"],
                hidden_size=model_config["hidden_size"],
                num_attention_heads=model_config["num_attention_heads"],
                max_position_embeddings=4096,
                init_method_std=0.02,
                use_scaled_init_method=True,
                attention_softmax_in_fp32=True,
                rotary_pct=0.25,  # LLaMA uses rotary embeddings
            )
            profiler.track("model_config", model_config)
            profiler.on_stage("config_created")

            try:
                tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
                profiler.on_stage("tokenizer_loaded")

                model = GPTModel(
                    config=config,
                    vocab_size=tokenizer.vocab_size,
                    max_sequence_length=4096,
                    parallel_output=False,
                    share_embeddings_and_output_weights=True,
                )
                profiler.on_stage("model_created")

                checkpoint = torch.load(str(model_path), map_location=device)
                model.load_state_dict(checkpoint)
                model.cuda()
                model.eval()
                profiler.on_stage("model_loaded")

                input_ids = tokenizer(args.input, return_tensors="pt").to(device)
                with torch.no_grad():
                    output = model(input_ids=input_ids["input_ids"])
                    logits = output.logits
                    next_token = torch.argmax(logits[:, -1, :], dim=-1)
                    generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
                    text = tokenizer.decode(generated[0], skip_special_tokens=True)
                    profiler.track("output", text)
                    profiler.on_stage("inference_complete")

            except Exception as e:
                profiler.track_error("model", str(e))
                print(f"Model loading/inference failed: {e}")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                raise

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            profiler.track_error("model", str(e))
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            raise

    except Exception as e:
        profiler.track_error("setup", str(e))
        print(f"Setup error: {e}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise

    finally:
        try:
            parallel_state.destroy_model_parallel()
        except Exception as e:
            profiler.track_error("cleanup", str(e))
            print(f"Cleanup error: {e}")
18 changes: 18 additions & 0 deletions dyana/loaders/megatron/requirements.txt
@@ -0,0 +1,18 @@
--extra-index-url https://download.pytorch.org/whl/cu121
--find-links https://developer.download.nvidia.com/compute/redist

# Base dependencies from Megatron core
torch>=2.0.0
packaging>=20.0
typing_extensions>=4.0.0

# Megatron DMC dependencies
flash-attn==2.6.1
sentencepiece==0.2.0
hydra-core==1.3.2
hydra_colorlog==1.2.0
nltk
datasets

# Utilities
psutil>=5.6.7
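A small check that the pinned dependencies actually import inside the image (a sketch, not part of this PR; the module list mirrors requirements.txt, noting that flash-attn and hydra-core import under different names):

import importlib

for module in ("torch", "flash_attn", "sentencepiece", "hydra", "nltk", "datasets", "psutil"):
    try:
        mod = importlib.import_module(module)
        print(f"{module}: {getattr(mod, '__version__', 'unknown')}")
    except ImportError as exc:
        print(f"{module}: MISSING ({exc})")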
29 changes: 29 additions & 0 deletions dyana/loaders/megatron/settings.yml
@@ -0,0 +1,29 @@
description: Loads and profiles Megatron-LM DMC models for efficient inference

build_args:
  extra-requirements: EXTRA_REQUIREMENTS

args:
  - name: model
    description: Path to Megatron model checkpoint
    required: true
    volume: true

  - name: tokenizer
    description: Path to Llama 2 tokenizer model
    required: true
    volume: true

  - name: size
    description: Model size (7B or 13B)
    required: true
    choices: ["7B", "13B"]

  - name: input
    description: Input text for inference
    default: "This is an example prompt."
    required: false

examples:
  - description: "Load a Megatron-DMC model with tokenizer:"
    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B