feat: custom [megatron] nvidia dmc loader #39

Open · wants to merge 17 commits into main

Changes from 10 commits
3 changes: 3 additions & 0 deletions dyana/loaders/megatron/.gitignore
@@ -0,0 +1,3 @@
dyana.py
dyana-requirements.txt
dyana-requirements-gpu.txt
87 changes: 87 additions & 0 deletions dyana/loaders/megatron/Dockerfile
@@ -0,0 +1,87 @@
FROM nvcr.io/nvidia/pytorch:24.04-py3

WORKDIR /app

# Install system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
ca-certificates \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Configure environment
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV CUDA_LAUNCH_BLOCKING=1
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
ENV CUDA_MODULE_LOADING=LAZY
ENV TORCH_USE_CUDA_DSA=0
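# Megatron-LM recommends CUDA_DEVICE_MAX_CONNECTIONS=1 so kernels launch in submission order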
ENV CUDA_DEVICE_MAX_CONNECTIONS=1
ENV NCCL_ASYNC_ERROR_HANDLING=1
ENV OMP_NUM_THREADS=1
ENV NVTE_FRAMEWORK=pytorch
ENV MAX_JOBS=4
ENV DEBIAN_FRONTEND=noninteractive
ENV TORCH_CUDNN_V8_API_ENABLED=1
ENV TORCH_ALLOW_TF32=1
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
ENV PYTORCH_JIT=0
ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1
ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0
ENV PYTHONFAULTHANDLER=1
ENV PYTHONUNBUFFERED=1
ENV NCCL_IB_DISABLE=1
ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1
ENV TORCH_SHOW_CPP_STACKTRACES=0
ENV PYTHONWARNINGS=ignore

# Verify the PyTorch installation at build time
RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"

# Create working directory
RUN mkdir -p /app/workspace

# Copy loader sources and requirements into the workspace
COPY requirements.txt /app/workspace/
COPY *.py /app/workspace/
COPY dyana-requirements*.txt /app/workspace/

WORKDIR /app/workspace

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Install Megatron-LM from NVIDIA's dmc branch, which carries the DMC changes
RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /app/Megatron-LM && \
cd /app/Megatron-LM && \
pip install -e .

ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH

# Create directories for IPC
RUN mkdir -p /dev/shm && \
mkdir -p /tmp/pytorch_extensions && \
chmod -R 777 /dev/shm /tmp/pytorch_extensions

# Create the entrypoint script: clear stale semaphores, set PYTHONPATH, exec the loader
RUN printf '%s\n' \
    '#!/bin/bash' \
    '# Clear any stale semaphores' \
    'rm -rf /dev/shm/* 2>/dev/null' \
    'export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH' \
    'exec python3 -W ignore main.py "$@"' \
    > /app/workspace/entrypoint.sh && \
    chmod +x /app/workspace/entrypoint.sh

# Set ownership and permissions, then verify the entrypoint is executable
RUN chown -R root:root /app && \
    chmod -R 755 /app && \
    chmod +x /app/workspace/entrypoint.sh && \
    test -x /app/workspace/entrypoint.sh

# The entrypoint script is executable and carries a shebang, so run it directly
ENTRYPOINT ["/app/workspace/entrypoint.sh"]
219 changes: 219 additions & 0 deletions dyana/loaders/megatron/main.py
@@ -0,0 +1,219 @@
# ruff: noqa: I001, E402
# type: ignore
import os
import sys
import logging
import warnings
import argparse
from pathlib import Path
from io import StringIO
import contextlib

logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["PYTHONWARNINGS"] = "ignore"

os.environ.update(
{
"CUDA_LAUNCH_BLOCKING": "1",
"PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
"TORCH_USE_CUDA_DSA": "0",
"NVTE_FRAMEWORK": "pytorch",
"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32",
"TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1",
"TORCH_INDUCTOR_USE_PYTHON_BINDING": "0",
"TORCH_SHOW_CPP_STACKTRACES": "0",
}
)

import torch # noqa: E402

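# Disable the TorchScript fusers so the model executes eagerly while being profiled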
torch._C._jit_set_nvfuser_enabled(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)

if __name__ == "__main__":
captured_output = StringIO()
with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output):
try:
from dyana import Profiler

profiler = Profiler(gpu=True)

# Initialize CUDA
if torch.cuda.is_available():
torch.cuda.init() # type: ignore[no-untyped-call]
torch.cuda.set_device(0)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
profiler.track(
"cuda_info",
{
"version": torch.version.cuda,
"device": torch.cuda.get_device_name(),
"device_count": torch.cuda.device_count(),
},
)
profiler.on_stage("cuda_initialized")

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--tokenizer", required=True)
parser.add_argument("--size", choices=["7B", "13B"], required=True)
parser.add_argument("--input", default="This is an example prompt.")
args = parser.parse_args()

model_path = Path(args.model)
tokenizer_path = Path(args.tokenizer)
if not model_path.exists():
raise FileNotFoundError(f"Model not found at {model_path}")
if not tokenizer_path.exists():
raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}")
profiler.on_stage("args_verified")

            from transformers import LlamaTokenizer
            from megatron.core import parallel_state

            # GPTModel import path per current Megatron-LM core; adjust if the dmc branch differs
            from megatron.core.models.gpt.gpt_model import GPTModel
            from megatron.core.transformer.transformer_config import TransformerConfig

            # Track whether Megatron's parallel state was initialized, so cleanup is conditional
            initialized_parallel = False

try:
# Use fork multiprocessing
if sys.platform == "linux":
import torch.multiprocessing as mp

mp.set_start_method("fork", force=True)

if torch.cuda.is_available():
print("=== Runtime Configuration ===")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")
print(f"Device: {torch.cuda.get_device_name()}")
print("===========================")
profiler.on_stage("cuda_verified")

                te_version = "unavailable"
                if torch.cuda.is_available():
                    # Transformer Engine needs no explicit init call; importing it is sufficient
                    try:
                        import transformer_engine

                        te_version = transformer_engine.__version__
                        print(f"Transformer Engine version: {te_version}")
                    except Exception as e:
                        print(f"Warning: Transformer Engine import failed: {e}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

                try:
                    print(f"CUDA devices: {torch.cuda.device_count()}")
                    print(f"CUDA version: {torch.version.cuda}")
                    profiler.track(
                        "env_info",
                        {
                            "te_version": te_version,
                            "cuda_devices": torch.cuda.device_count(),
                            "cuda_version": torch.version.cuda,
                        },
                    )

                    # Initialize Megatron's model-parallel state; a single process means
                    # no tensor or pipeline parallelism
                    parallel_state.initialize_model_parallel(
                        tensor_model_parallel_size=1,
                        pipeline_model_parallel_size=1,
                    )
profiler.on_stage("megatron_initialized")

                    initialized_parallel = True

# Model config
model_config = {
"7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32},
"13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40},
}[args.size]

# Megatron transformer config
config = TransformerConfig(
num_layers=model_config["num_layers"],
hidden_size=model_config["hidden_size"],
num_attention_heads=model_config["num_attention_heads"],
max_position_embeddings=4096,
init_method_std=0.02,
use_scaled_init_method=True,
attention_softmax_in_fp32=True,
rotary_pct=0.25, # LLaMA uses rotary embeddings
)
profiler.track("model_config", model_config)
profiler.on_stage("config_created")

try:
tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True)
profiler.on_stage("tokenizer_loaded")

                        model = GPTModel(
config=config,
vocab_size=tokenizer.vocab_size,
max_sequence_length=4096,
parallel_output=False,
share_embeddings_and_output_weights=True,
)
profiler.on_stage("model_created")

                        # Load the DMC checkpoint (torch.load deserializes via pickle)
checkpoint = torch.load(str(model_path), map_location=device)
model.load_state_dict(checkpoint)
model.cuda()
model.eval()
profiler.on_stage("model_loaded")

# Run inference
input_ids = tokenizer(args.input, return_tensors="pt").to(device)
with torch.no_grad():
output = model(input_ids=input_ids["input_ids"])
logits = output.logits
next_token = torch.argmax(logits[:, -1, :], dim=-1)
generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1)
text = tokenizer.decode(generated[0], skip_special_tokens=True)
profiler.track("output", text)
profiler.on_stage("inference_complete")

except Exception as e:
profiler.track_error("model", str(e))
print(f"Model loading/inference failed: {e}")
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

except Exception as e:
print(f"Error occurred: {str(e)}")
profiler.track_error("model", str(e))
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

except Exception as e:
profiler.track_error("setup", str(e))
print(f"Setup error: {e}")
if torch.cuda.is_available():
torch.cuda.empty_cache()
raise

finally:
# Clean up Megatron's parallel state only if it was initialized
try:
if initialized_parallel:
parallel_state.destroy_model_parallel()
except Exception as e:
profiler.track_error("cleanup", str(e))
print(f"Cleanup error: {e}")

except Exception as e:
profiler.track_error("runtime", str(e))
print(f"Error: {e}", file=sys.stderr)
raise
finally:
profiler.flush()
print(captured_output.getvalue(), file=sys.stderr)
18 changes: 18 additions & 0 deletions dyana/loaders/megatron/requirements.txt
@@ -0,0 +1,18 @@
--extra-index-url https://download.pytorch.org/whl/cu121
--find-links https://developer.download.nvidia.com/compute/redist

# Base dependencies from Megatron core
torch>=2.0.0
packaging>=20.0
typing_extensions>=4.0.0

# Megatron DMC dependencies
flash-attn==2.6.1
sentencepiece==0.2.0
hydra-core==1.3.2
hydra_colorlog==1.2.0
nltk
datasets

# Utilities
psutil>=5.6.7
29 changes: 29 additions & 0 deletions dyana/loaders/megatron/settings.yml
@@ -0,0 +1,29 @@
description: Loads and profiles Megatron-LM DMC models for efficient inference

build_args:
extra-requirements: EXTRA_REQUIREMENTS

args:
- name: model
description: Path to Megatron model checkpoint
required: true
volume: true

- name: tokenizer
description: Path to Llama 2 tokenizer model
required: true
volume: true

- name: size
description: Model size (7B or 13B)
required: true
choices: ["7B", "13B"]

- name: input
description: Input text for inference
default: "This is an example prompt."
required: false

examples:
- description: "Load a Megatron-DMC model with tokenizer:"
command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B
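  # Additional example (sketch, same placeholder paths): override the default prompt via the optional --input flag
  - description: "Load a Megatron-DMC model with a custom prompt:"
    command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 13B --input "Describe dynamic memory compression."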