From 84fe6d9b36a00a21b06cacf9994cd63e76b3cd7b Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:19:01 -0500 Subject: [PATCH 01/16] feat: nvidia dmc custom loader --- dyana/loaders/megatron/.gitignore | 3 + dyana/loaders/megatron/Dockerfile | 28 +++++ dyana/loaders/megatron/main.py | 153 ++++++++++++++++++++++++ dyana/loaders/megatron/requirements.txt | 28 +++++ dyana/loaders/megatron/settings.yml | 29 +++++ 5 files changed, 241 insertions(+) create mode 100644 dyana/loaders/megatron/.gitignore create mode 100644 dyana/loaders/megatron/Dockerfile create mode 100644 dyana/loaders/megatron/main.py create mode 100644 dyana/loaders/megatron/requirements.txt create mode 100644 dyana/loaders/megatron/settings.yml diff --git a/dyana/loaders/megatron/.gitignore b/dyana/loaders/megatron/.gitignore new file mode 100644 index 0000000..3d0dd7e --- /dev/null +++ b/dyana/loaders/megatron/.gitignore @@ -0,0 +1,3 @@ +dyana.py +dyana-requirements.txt +dyana-requirements-gpu.txt \ No newline at end of file diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile new file mode 100644 index 0000000..a487cff --- /dev/null +++ b/dyana/loaders/megatron/Dockerfile @@ -0,0 +1,28 @@ +FROM nvcr.io/nvidia/pytorch:24.04-py3 + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Clone Megatron-LM dmc branch and install +RUN git clone -b dmc https://github.com/NVIDIA/Megatron-LM.git && \ + cd Megatron-LM && \ + pip install -e . + +# Copy loader files +COPY . . + +# Install requirements +RUN pip install --no-cache-dir -r dyana-requirements-gpu.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Environment setup +ENV CUDA_DEVICE_MAX_CONNECTIONS=1 +ENV PYTHONUNBUFFERED=1 +ENV MEGATRON_DEBUG=1 + +ENTRYPOINT ["python3", "-u", "main.py"] \ No newline at end of file diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py new file mode 100644 index 0000000..c98e289 --- /dev/null +++ b/dyana/loaders/megatron/main.py @@ -0,0 +1,153 @@ +import os +import sys +import torch +from pathlib import Path +from dyana import Profiler + +from megatron.core import parallel_state +from megatron.core.models.gpt import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.training import get_args, get_model +from megatron.training.arguments import parse_args, core_transformer_config_from_args +from megatron.training.initialize import initialize_megatron +from megatron.training.checkpointing import load_checkpoint +from megatron.contrib.dmc import add_dmc_layer + + +def setup_megatron_args(model_size: str, model_path: str, tokenizer_path: str): + """Setup Megatron arguments""" + print("Debug: Starting argument setup") + sys.argv = [sys.argv[0]] + + args = [ + "--tensor-model-parallel-size", + "1", + "--pipeline-model-parallel-size", + "1", + "--load", + model_path, + "--tokenizer-model", + tokenizer_path, + "--tokenizer-type", + "Llama2Tokenizer", + "--bf16", + "--seq-length", + "4096", + "--max-position-embeddings", + "4096", + "--num-layers", + "32" if model_size == "7B" else "40", + "--hidden-size", + "4096" if model_size == "7B" else "5120", + "--num-attention-heads", + "32" if model_size == "7B" else "40", + "--micro-batch-size", + "1", + "--global-batch-size", + "1", + "--no-masked-softmax-fusion", + "--no-load-optim", + "--no-load-rng", + "--skip-train", + "--fp16", + "--use-cpu-initialization", # avoid CUDA deadlocks + "--tokenizer-type", + "Llama2Tokenizer", + ] + + print("Debug: Setting sys.argv") + sys.argv.extend(args) + + print("Debug: Parsing args") + args = parse_args() + + print("Debug: Initializing Megatron") + initialize_megatron(args_defaults={"no_load_optim": True, "no_load_rng": True}) + + return get_args() + + +def model_provider(pre_process=True, post_process=True): + """Model provider for Megatron to load the model.""" + print("Debug: Setting up model provider") + args = get_args() + config = core_transformer_config_from_args(args) + + print("Debug: Creating model") + model = GPTModel( + config=config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + ) + + return model + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--tokenizer", required=True) + parser.add_argument("--size", choices=["7B", "13B"], required=True) + parser.add_argument("--input", default="This is an example prompt.") + args = parser.parse_args() + + profiler = Profiler(gpu=True) + + try: + # verify files + model_path = Path(args.model) + tokenizer_path = Path(args.tokenizer) + if not model_path.exists(): + raise FileNotFoundError(f"Model not found at {model_path}") + if not tokenizer_path.exists(): + raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") + + print("Debug: Starting initialization") + profiler.on_stage("initializing") + + print("Debug: Setting up args") + args = setup_megatron_args(args.size, str(model_path), str(tokenizer_path)) + + print("Debug: Initializing model parallel") + torch.cuda.empty_cache() + parallel_state.set_tensor_model_parallel_world_size(1) + parallel_state.set_tensor_model_parallel_rank(0) + + print("Debug: Creating model") + model = get_model(model_provider, wrap_with_ddp=False) + + print("Debug: Loading checkpoint") + _ = load_checkpoint(model[0], None, None) + model = model[0].cuda() + model.eval() + + print("Loading tokenizer...") + from transformers import LlamaTokenizer + + tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path)) + + print("Starting inference...") + input_ids = tokenizer(args.input, return_tensors="pt").to("cuda") + + with torch.no_grad(): + output = model.generate(input_ids=input_ids["input_ids"], max_new_tokens=100, use_cache=True) + text = tokenizer.decode(output[0], skip_special_tokens=True) + profiler.track("output", text) + print(f"Generated text: {text}") + + profiler.on_stage("complete") + + except Exception as e: + print(f"Debug: Error occurred: {str(e)}") + print(f"Debug: Error type: {type(e)}") + import traceback + + print(f"Debug: Traceback: {traceback.format_exc()}") + profiler.track_error("model", str(e)) + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt new file mode 100644 index 0000000..31b750a --- /dev/null +++ b/dyana/loaders/megatron/requirements.txt @@ -0,0 +1,28 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +# Core dependencies +torch>=2.1.0 +transformers>=4.31.0 +accelerate>=0.21.0 +psutil>=5.6.7 + +# Megatron and model dependencies +ninja +sentencepiece==0.2.0 +tokenizers>=0.13.3 +transformer-engine>=1.3 +einops>=0.6.1 +evaluate +scikit-learn +flash-attn==2.6.1 +hydra_colorlog==1.2.0 +hydra-core==1.3.2 +nltk +datasets + +# Dyana dependencies - using the base requirements +rich>=10.0.0 +pydantic>=2.0.0 +pydantic-yaml>=1.0.0 +docker>=6.0.0 +psutil>=5.6.7 +nvidia-ml-py>=12.0.0 \ No newline at end of file diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml new file mode 100644 index 0000000..2aa2b75 --- /dev/null +++ b/dyana/loaders/megatron/settings.yml @@ -0,0 +1,29 @@ +description: Loads and profiles Megatron-LM DMC models for efficient inference + +build_args: + extra-requirements: EXTRA_REQUIREMENTS + +args: + - name: model + description: Path to Megatron model checkpoint + required: true + volume: true + + - name: tokenizer + description: Path to Llama 2 tokenizer model + required: true + volume: true + + - name: size + description: Model size (7B or 13B) + required: true + choices: ["7B", "13B"] + + - name: input + description: Input text for inference + default: "This is an example prompt." + required: false + +examples: + - description: "Load a Megatron-DMC model with tokenizer:" + command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B From 0d41168172681a6e54baa175457fc31825a0c039 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:28:18 -0500 Subject: [PATCH 02/16] fix: try except err handling in main script --- dyana/loaders/megatron/Dockerfile | 72 +++++++++- dyana/loaders/megatron/main.py | 169 ++++++++++++++++++++++++ dyana/loaders/megatron/requirements.txt | 22 ++- dyana/loaders/megatron/settings.yml | 4 + 4 files changed, 265 insertions(+), 2 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index a487cff..46a1964 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -6,6 +6,7 @@ WORKDIR /app RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ +<<<<<<< Updated upstream && rm -rf /var/lib/apt/lists/* # Clone Megatron-LM dmc branch and install @@ -25,4 +26,73 @@ ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV PYTHONUNBUFFERED=1 ENV MEGATRON_DEBUG=1 -ENTRYPOINT ["python3", "-u", "main.py"] \ No newline at end of file +ENTRYPOINT ["python3", "-u", "main.py"] +======= + ca-certificates \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Configure environment +ENV CUDA_HOME=/usr/local/cuda +ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +ENV CUDA_LAUNCH_BLOCKING=1 +ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32 +ENV CUDA_MODULE_LOADING=LAZY +ENV TORCH_USE_CUDA_DSA=1 +ENV CUDA_DEVICE_MAX_CONNECTIONS=1 +ENV NCCL_ASYNC_ERROR_HANDLING=1 +ENV OMP_NUM_THREADS=1 +ENV NVTE_FRAMEWORK=pytorch +ENV MAX_JOBS=4 +ENV DEBIAN_FRONTEND=noninteractive +ENV TORCH_CUDNN_V8_API_ENABLED=1 +ENV TORCH_ALLOW_TF32=1 +ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" +ENV PYTORCH_JIT=0 +ENV TORCH_COMPILE_DEBUG=1 +ENV TORCH_INDUCTOR_VAR_NAMES=1 + +# Only verify PyTorch during build +RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')" + +# Create working directory +RUN mkdir -p /app/workspace + +# Copy files in correct order +COPY requirements.txt /app/workspace/ +COPY *.py /app/workspace/ +COPY dyana-requirements*.txt /app/workspace/ + +WORKDIR /app/workspace + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Install Megatron-LM +RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git /app/Megatron-LM && \ + cd /app/Megatron-LM && \ + pip install -e . + +ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH + +# Create simpler entrypoint script +RUN printf '#!/bin/bash\n\ + export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\ + export PYTORCH_NO_CUDA_MEMORY_CACHING=1\n\ + exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \ + chmod +x /app/workspace/entrypoint.sh + +# Verify files exist and have correct permissions +RUN ls -la /app/workspace && \ + ls -la /app/workspace/entrypoint.sh && \ + test -x /app/workspace/entrypoint.sh + +# Set proper ownership and permissions +RUN chown -R root:root /app && \ + chmod -R 755 /app && \ + chmod +x /app/workspace/entrypoint.sh + +# Use full path in entrypoint +ENTRYPOINT ["/app/workspace/entrypoint.sh"] +>>>>>>> Stashed changes diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index c98e289..9e20978 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,9 +1,15 @@ +<<<<<<< Updated upstream import os import sys +======= +import argparse +import os +>>>>>>> Stashed changes import torch from pathlib import Path from dyana import Profiler +<<<<<<< Updated upstream from megatron.core import parallel_state from megatron.core.models.gpt import GPTModel from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec @@ -151,3 +157,166 @@ def model_provider(pre_process=True, post_process=True): profiler.track_error("model", str(e)) if torch.cuda.is_available(): torch.cuda.empty_cache() +======= + +def verify_cuda_setup(): + """Verify CUDA and PyTorch setup before model loading""" + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + + # Disable JIT/Inductor + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(False) + + print("=== Runtime Configuration ===") + print(f"PyTorch: {torch.__version__}") + print(f"CUDA: {torch.version.cuda}") + print(f"Device: {torch.cuda.get_device_name()}") + print("===========================") + + # Set default device + torch.cuda.set_device(0) + + +if __name__ == "__main__": + # Initialize profiler first + profiler = Profiler(gpu=True) + + try: + # Verify CUDA setup + verify_cuda_setup() + profiler.on_stage("cuda_verified") + + os.environ["TE_VERBOSE"] = "1" + os.environ["NVTE_FRAMEWORK"] = "pytorch" + print("Starting Megatron loader with verbose logging...") + + # initialize CUDA and Transformer + if torch.cuda.is_available(): + import transformer_engine.pytorch as te + + te.initialize() + print(f"Initialized Transformer Engine version: {te.__version__}") + + # import Megatron dependencies + from megatron.core import parallel_state + from megatron.core.transformer.transformer_config import TransformerConfig + from transformers import LlamaTokenizer + + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--tokenizer", required=True) + parser.add_argument("--size", choices=["7B", "13B"], required=True) + parser.add_argument("--input", default="This is an example prompt.") + args = parser.parse_args() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + try: + print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"CUDA devices: {torch.cuda.device_count()}") + print(f"CUDA version: {torch.version.cuda}") + profiler.track( + "env_info", + { + "te_version": transformer_engine.__version__, + "cuda_devices": torch.cuda.device_count(), + "cuda_version": torch.version.cuda, + }, + ) + + model_path = Path(args.model) + tokenizer_path = Path(args.tokenizer) + if not model_path.exists(): + raise FileNotFoundError(f"Model not found at {model_path}") + if not tokenizer_path.exists(): + raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") + + # Initialize Megatron's tensor parallel + world_size = torch.cuda.device_count() + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ) + profiler.on_stage("megatron_initialized") + + # Model config based on size + model_config = { + "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}, + "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, + }[args.size] + + # Create Megatron transformer config + config = TransformerConfig( + num_layers=model_config["num_layers"], + hidden_size=model_config["hidden_size"], + num_attention_heads=model_config["num_attention_heads"], + max_position_embeddings=4096, + init_method_std=0.02, + use_scaled_init_method=True, + attention_softmax_in_fp32=True, + rotary_pct=0.25, # LLaMA uses rotary embeddings + ) + profiler.track("model_config", model_config) + profiler.on_stage("config_created") + + try: + tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) + profiler.on_stage("tokenizer_loaded") + + model = GPTModel( + config=config, + vocab_size=tokenizer.vocab_size, + max_sequence_length=4096, + parallel_output=False, + share_embeddings_and_output_weights=True, + ) + profiler.on_stage("model_created") + + # Load DMC checkpoint + checkpoint = torch.load(str(model_path), map_location=device) + model.load_state_dict(checkpoint) + model.cuda() + model.eval() + profiler.on_stage("model_loaded") + + input_ids = tokenizer(args.input, return_tensors="pt").to(device) + with torch.no_grad(): + output = model(input_ids=input_ids["input_ids"]) + logits = output.logits + next_token = torch.argmax(logits[:, -1, :], dim=-1) + generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1) + text = tokenizer.decode(generated[0], skip_special_tokens=True) + profiler.track("output", text) + profiler.on_stage("inference_complete") + + except Exception as e: + profiler.track_error("model", str(e)) + print(f"Model loading/inference failed: {e}") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + raise + + except Exception as e: + print(f"Error occurred: {str(e)}") + profiler.track_error("model", str(e)) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + raise + + except Exception as e: + profiler.track_error("setup", str(e)) + print(f"Setup error: {e}") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + raise + + finally: + try: + parallel_state.destroy_model_parallel() + except Exception as e: + profiler.track_error("cleanup", str(e)) + print(f"Cleanup error: {e}") +>>>>>>> Stashed changes diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt index 31b750a..7bb4c24 100644 --- a/dyana/loaders/megatron/requirements.txt +++ b/dyana/loaders/megatron/requirements.txt @@ -1,4 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cu121 +<<<<<<< Updated upstream # Core dependencies torch>=2.1.0 transformers>=4.31.0 @@ -25,4 +26,23 @@ pydantic>=2.0.0 pydantic-yaml>=1.0.0 docker>=6.0.0 psutil>=5.6.7 -nvidia-ml-py>=12.0.0 \ No newline at end of file +nvidia-ml-py>=12.0.0 +======= +--find-links https://developer.download.nvidia.com/compute/redist + +# Base dependencies from Megatron core +torch>=2.0.0 +packaging>=20.0 +typing_extensions>=4.0.0 + +# Megatron DMC dependencies +flash-attn==2.6.1 +sentencepiece==0.2.0 +hydra-core==1.3.2 +hydra_colorlog==1.2.0 +nltk +datasets + +# Utilities +psutil>=5.6.7 +>>>>>>> Stashed changes diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml index 2aa2b75..770a8d1 100644 --- a/dyana/loaders/megatron/settings.yml +++ b/dyana/loaders/megatron/settings.yml @@ -26,4 +26,8 @@ args: examples: - description: "Load a Megatron-DMC model with tokenizer:" +<<<<<<< Updated upstream command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B +======= + command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B --verbose +>>>>>>> Stashed changes From 1d43a42cbf2536d5489e026c39740944eccd9db1 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:32:14 -0500 Subject: [PATCH 03/16] fix: merge conflicts --- dyana/loaders/megatron/Dockerfile | 26 +--- dyana/loaders/megatron/main.py | 173 ++---------------------- dyana/loaders/megatron/requirements.txt | 32 +---- dyana/loaders/megatron/settings.yml | 4 - 4 files changed, 16 insertions(+), 219 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index 46a1964..80af966 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -6,28 +6,6 @@ WORKDIR /app RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ -<<<<<<< Updated upstream - && rm -rf /var/lib/apt/lists/* - -# Clone Megatron-LM dmc branch and install -RUN git clone -b dmc https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - pip install -e . - -# Copy loader files -COPY . . - -# Install requirements -RUN pip install --no-cache-dir -r dyana-requirements-gpu.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Environment setup -ENV CUDA_DEVICE_MAX_CONNECTIONS=1 -ENV PYTHONUNBUFFERED=1 -ENV MEGATRON_DEBUG=1 - -ENTRYPOINT ["python3", "-u", "main.py"] -======= ca-certificates \ build-essential \ && rm -rf /var/lib/apt/lists/* @@ -46,6 +24,7 @@ ENV OMP_NUM_THREADS=1 ENV NVTE_FRAMEWORK=pytorch ENV MAX_JOBS=4 ENV DEBIAN_FRONTEND=noninteractive +# Add these new environment variables ENV TORCH_CUDNN_V8_API_ENABLED=1 ENV TORCH_ALLOW_TF32=1 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" @@ -94,5 +73,4 @@ RUN chown -R root:root /app && \ chmod +x /app/workspace/entrypoint.sh # Use full path in entrypoint -ENTRYPOINT ["/app/workspace/entrypoint.sh"] ->>>>>>> Stashed changes +ENTRYPOINT ["/app/workspace/entrypoint.sh"] \ No newline at end of file diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 9e20978..5b9b998 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,170 +1,16 @@ -<<<<<<< Updated upstream -import os -import sys -======= import argparse import os ->>>>>>> Stashed changes import torch from pathlib import Path from dyana import Profiler -<<<<<<< Updated upstream -from megatron.core import parallel_state -from megatron.core.models.gpt import GPTModel -from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec -from megatron.training import get_args, get_model -from megatron.training.arguments import parse_args, core_transformer_config_from_args -from megatron.training.initialize import initialize_megatron -from megatron.training.checkpointing import load_checkpoint -from megatron.contrib.dmc import add_dmc_layer - - -def setup_megatron_args(model_size: str, model_path: str, tokenizer_path: str): - """Setup Megatron arguments""" - print("Debug: Starting argument setup") - sys.argv = [sys.argv[0]] - - args = [ - "--tensor-model-parallel-size", - "1", - "--pipeline-model-parallel-size", - "1", - "--load", - model_path, - "--tokenizer-model", - tokenizer_path, - "--tokenizer-type", - "Llama2Tokenizer", - "--bf16", - "--seq-length", - "4096", - "--max-position-embeddings", - "4096", - "--num-layers", - "32" if model_size == "7B" else "40", - "--hidden-size", - "4096" if model_size == "7B" else "5120", - "--num-attention-heads", - "32" if model_size == "7B" else "40", - "--micro-batch-size", - "1", - "--global-batch-size", - "1", - "--no-masked-softmax-fusion", - "--no-load-optim", - "--no-load-rng", - "--skip-train", - "--fp16", - "--use-cpu-initialization", # avoid CUDA deadlocks - "--tokenizer-type", - "Llama2Tokenizer", - ] - - print("Debug: Setting sys.argv") - sys.argv.extend(args) - - print("Debug: Parsing args") - args = parse_args() - - print("Debug: Initializing Megatron") - initialize_megatron(args_defaults={"no_load_optim": True, "no_load_rng": True}) - - return get_args() - - -def model_provider(pre_process=True, post_process=True): - """Model provider for Megatron to load the model.""" - print("Debug: Setting up model provider") - args = get_args() - config = core_transformer_config_from_args(args) - - print("Debug: Creating model") - model = GPTModel( - config=config, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), - vocab_size=args.padded_vocab_size, - max_sequence_length=args.max_position_embeddings, - pre_process=pre_process, - post_process=post_process, - ) - - return model - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - parser.add_argument("--tokenizer", required=True) - parser.add_argument("--size", choices=["7B", "13B"], required=True) - parser.add_argument("--input", default="This is an example prompt.") - args = parser.parse_args() - - profiler = Profiler(gpu=True) - - try: - # verify files - model_path = Path(args.model) - tokenizer_path = Path(args.tokenizer) - if not model_path.exists(): - raise FileNotFoundError(f"Model not found at {model_path}") - if not tokenizer_path.exists(): - raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") - - print("Debug: Starting initialization") - profiler.on_stage("initializing") - - print("Debug: Setting up args") - args = setup_megatron_args(args.size, str(model_path), str(tokenizer_path)) - - print("Debug: Initializing model parallel") - torch.cuda.empty_cache() - parallel_state.set_tensor_model_parallel_world_size(1) - parallel_state.set_tensor_model_parallel_rank(0) - - print("Debug: Creating model") - model = get_model(model_provider, wrap_with_ddp=False) - - print("Debug: Loading checkpoint") - _ = load_checkpoint(model[0], None, None) - model = model[0].cuda() - model.eval() - - print("Loading tokenizer...") - from transformers import LlamaTokenizer - - tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path)) - - print("Starting inference...") - input_ids = tokenizer(args.input, return_tensors="pt").to("cuda") - - with torch.no_grad(): - output = model.generate(input_ids=input_ids["input_ids"], max_new_tokens=100, use_cache=True) - text = tokenizer.decode(output[0], skip_special_tokens=True) - profiler.track("output", text) - print(f"Generated text: {text}") - - profiler.on_stage("complete") - - except Exception as e: - print(f"Debug: Error occurred: {str(e)}") - print(f"Debug: Error type: {type(e)}") - import traceback - - print(f"Debug: Traceback: {traceback.format_exc()}") - profiler.track_error("model", str(e)) - if torch.cuda.is_available(): - torch.cuda.empty_cache() -======= def verify_cuda_setup(): """Verify CUDA and PyTorch setup before model loading""" if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available") - # Disable JIT/Inductor + # Disable JIT/Inductor features torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_set_texpr_fuser_enabled(False) @@ -189,18 +35,19 @@ def verify_cuda_setup(): verify_cuda_setup() profiler.on_stage("cuda_verified") + # Enable verbose logging and configure environment os.environ["TE_VERBOSE"] = "1" os.environ["NVTE_FRAMEWORK"] = "pytorch" print("Starting Megatron loader with verbose logging...") - # initialize CUDA and Transformer + # Initialize CUDA and Transformer Engine if torch.cuda.is_available(): import transformer_engine.pytorch as te te.initialize() print(f"Initialized Transformer Engine version: {te.__version__}") - # import Megatron dependencies + # Now import Megatron dependencies from megatron.core import parallel_state from megatron.core.transformer.transformer_config import TransformerConfig from transformers import LlamaTokenizer @@ -215,6 +62,7 @@ def verify_cuda_setup(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: + # Print Megatron environment info print(f"Transformer Engine version: {transformer_engine.__version__}") print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") @@ -227,6 +75,7 @@ def verify_cuda_setup(): }, ) + # Verify files exist model_path = Path(args.model) tokenizer_path = Path(args.tokenizer) if not model_path.exists(): @@ -237,8 +86,8 @@ def verify_cuda_setup(): # Initialize Megatron's tensor parallel world_size = torch.cuda.device_count() parallel_state.initialize_model_parallel( - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, + tensor_model_parallel_size=1, # No tensor parallelism for now + pipeline_model_parallel_size=1, # No pipeline parallelism ) profiler.on_stage("megatron_initialized") @@ -263,9 +112,11 @@ def verify_cuda_setup(): profiler.on_stage("config_created") try: + # Load tokenizer tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) profiler.on_stage("tokenizer_loaded") + # Initialize Megatron model model = GPTModel( config=config, vocab_size=tokenizer.vocab_size, @@ -282,8 +133,10 @@ def verify_cuda_setup(): model.eval() profiler.on_stage("model_loaded") + # Run inference input_ids = tokenizer(args.input, return_tensors="pt").to(device) with torch.no_grad(): + # Megatron expects different input format output = model(input_ids=input_ids["input_ids"]) logits = output.logits next_token = torch.argmax(logits[:, -1, :], dim=-1) @@ -314,9 +167,9 @@ def verify_cuda_setup(): raise finally: + # Clean up Megatron's parallel state try: parallel_state.destroy_model_parallel() except Exception as e: profiler.track_error("cleanup", str(e)) print(f"Cleanup error: {e}") ->>>>>>> Stashed changes diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt index 7bb4c24..fc435c0 100644 --- a/dyana/loaders/megatron/requirements.txt +++ b/dyana/loaders/megatron/requirements.txt @@ -1,33 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cu121 -<<<<<<< Updated upstream -# Core dependencies -torch>=2.1.0 -transformers>=4.31.0 -accelerate>=0.21.0 -psutil>=5.6.7 - -# Megatron and model dependencies -ninja -sentencepiece==0.2.0 -tokenizers>=0.13.3 -transformer-engine>=1.3 -einops>=0.6.1 -evaluate -scikit-learn -flash-attn==2.6.1 -hydra_colorlog==1.2.0 -hydra-core==1.3.2 -nltk -datasets - -# Dyana dependencies - using the base requirements -rich>=10.0.0 -pydantic>=2.0.0 -pydantic-yaml>=1.0.0 -docker>=6.0.0 -psutil>=5.6.7 -nvidia-ml-py>=12.0.0 -======= --find-links https://developer.download.nvidia.com/compute/redist # Base dependencies from Megatron core @@ -44,5 +15,4 @@ nltk datasets # Utilities -psutil>=5.6.7 ->>>>>>> Stashed changes +psutil>=5.6.7 \ No newline at end of file diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml index 770a8d1..2aa2b75 100644 --- a/dyana/loaders/megatron/settings.yml +++ b/dyana/loaders/megatron/settings.yml @@ -26,8 +26,4 @@ args: examples: - description: "Load a Megatron-DMC model with tokenizer:" -<<<<<<< Updated upstream command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B -======= - command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B --verbose ->>>>>>> Stashed changes From 5df5150b01415ac2f5e4c327ac60548de7a45f7b Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:37:29 -0500 Subject: [PATCH 04/16] fix: lint errors --- dyana/loaders/megatron/Dockerfile | 1 - dyana/loaders/megatron/main.py | 23 +++++++---------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index 80af966..bb582a6 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -24,7 +24,6 @@ ENV OMP_NUM_THREADS=1 ENV NVTE_FRAMEWORK=pytorch ENV MAX_JOBS=4 ENV DEBIAN_FRONTEND=noninteractive -# Add these new environment variables ENV TORCH_CUDNN_V8_API_ENABLED=1 ENV TORCH_ALLOW_TF32=1 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 5b9b998..e52f5b1 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,7 +1,11 @@ import argparse import os -import torch from pathlib import Path + +import torch +import transformer_engine as te +from megatron.model.gpt_model import GPTModel + from dyana import Profiler @@ -22,12 +26,10 @@ def verify_cuda_setup(): print(f"Device: {torch.cuda.get_device_name()}") print("===========================") - # Set default device torch.cuda.set_device(0) if __name__ == "__main__": - # Initialize profiler first profiler = Profiler(gpu=True) try: @@ -35,7 +37,6 @@ def verify_cuda_setup(): verify_cuda_setup() profiler.on_stage("cuda_verified") - # Enable verbose logging and configure environment os.environ["TE_VERBOSE"] = "1" os.environ["NVTE_FRAMEWORK"] = "pytorch" print("Starting Megatron loader with verbose logging...") @@ -47,7 +48,6 @@ def verify_cuda_setup(): te.initialize() print(f"Initialized Transformer Engine version: {te.__version__}") - # Now import Megatron dependencies from megatron.core import parallel_state from megatron.core.transformer.transformer_config import TransformerConfig from transformers import LlamaTokenizer @@ -62,20 +62,18 @@ def verify_cuda_setup(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - # Print Megatron environment info - print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"Transformer Engine version: {te.__version__}") print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, + "te_version": te.__version__, "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, ) - # Verify files exist model_path = Path(args.model) tokenizer_path = Path(args.tokenizer) if not model_path.exists(): @@ -97,7 +95,6 @@ def verify_cuda_setup(): "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, }[args.size] - # Create Megatron transformer config config = TransformerConfig( num_layers=model_config["num_layers"], hidden_size=model_config["hidden_size"], @@ -112,11 +109,9 @@ def verify_cuda_setup(): profiler.on_stage("config_created") try: - # Load tokenizer tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) profiler.on_stage("tokenizer_loaded") - # Initialize Megatron model model = GPTModel( config=config, vocab_size=tokenizer.vocab_size, @@ -126,17 +121,14 @@ def verify_cuda_setup(): ) profiler.on_stage("model_created") - # Load DMC checkpoint checkpoint = torch.load(str(model_path), map_location=device) model.load_state_dict(checkpoint) model.cuda() model.eval() profiler.on_stage("model_loaded") - # Run inference input_ids = tokenizer(args.input, return_tensors="pt").to(device) with torch.no_grad(): - # Megatron expects different input format output = model(input_ids=input_ids["input_ids"]) logits = output.logits next_token = torch.argmax(logits[:, -1, :], dim=-1) @@ -167,7 +159,6 @@ def verify_cuda_setup(): raise finally: - # Clean up Megatron's parallel state try: parallel_state.destroy_model_parallel() except Exception as e: From 5885705a40afb260d311db5fb0af85d2dbdd6cfb Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Mon, 3 Feb 2025 21:41:10 -0500 Subject: [PATCH 05/16] fix: import path typechecks --- dyana/loaders/megatron/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index e52f5b1..f354e52 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -5,11 +5,10 @@ import torch import transformer_engine as te from megatron.model.gpt_model import GPTModel +from dyana.profiler import Profiler -from dyana import Profiler - -def verify_cuda_setup(): +def verify_cuda_setup() -> None: """Verify CUDA and PyTorch setup before model loading""" if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available") From d67d6241eba6b97f7e1bea34e68235c8a2352318 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 08:02:20 -0500 Subject: [PATCH 06/16] fix: lint regression errors --- dyana/loaders/megatron/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index f354e52..7b5c710 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -5,6 +5,7 @@ import torch import transformer_engine as te from megatron.model.gpt_model import GPTModel + from dyana.profiler import Profiler From 4f050e95bb8bdf6d019c11d33e841e9ecb083d3c Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:40:23 -0500 Subject: [PATCH 07/16] chore: reduce complexity in PyTorch JIT/inductor features and os error cleanup --- dyana/loaders/megatron/Dockerfile | 26 ++- dyana/loaders/megatron/main.py | 327 +++++++++++++++++------------- 2 files changed, 208 insertions(+), 145 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index bb582a6..f3ad5f9 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -17,7 +17,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH ENV CUDA_LAUNCH_BLOCKING=1 ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32 ENV CUDA_MODULE_LOADING=LAZY -ENV TORCH_USE_CUDA_DSA=1 +ENV TORCH_USE_CUDA_DSA=0 ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_ASYNC_ERROR_HANDLING=1 ENV OMP_NUM_THREADS=1 @@ -28,8 +28,14 @@ ENV TORCH_CUDNN_V8_API_ENABLED=1 ENV TORCH_ALLOW_TF32=1 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" ENV PYTORCH_JIT=0 -ENV TORCH_COMPILE_DEBUG=1 -ENV TORCH_INDUCTOR_VAR_NAMES=1 +ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1 +ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0 +ENV PYTHONFAULTHANDLER=1 +ENV PYTHONUNBUFFERED=1 +ENV NCCL_IB_DISABLE=1 +ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1 +ENV TORCH_SHOW_CPP_STACKTRACES=0 +ENV PYTHONWARNINGS=ignore # Only verify PyTorch during build RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')" @@ -54,10 +60,15 @@ RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git / ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH -# Create simpler entrypoint script +# Create directories for IPC +RUN mkdir -p /dev/shm && \ + mkdir -p /tmp/pytorch_extensions && \ + chmod -R 777 /dev/shm /tmp/pytorch_extensions + +# Create simpler entrypoint script with proper environment RUN printf '#!/bin/bash\n\ + # Clear any stale semaphores\nrm -rf /dev/shm/* 2>/dev/null\n\ export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\ - export PYTORCH_NO_CUDA_MEMORY_CACHING=1\n\ exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \ chmod +x /app/workspace/entrypoint.sh @@ -71,5 +82,6 @@ RUN chown -R root:root /app && \ chmod -R 755 /app && \ chmod +x /app/workspace/entrypoint.sh -# Use full path in entrypoint -ENTRYPOINT ["/app/workspace/entrypoint.sh"] \ No newline at end of file +# Use bash as entrypoint shell +SHELL ["/bin/bash", "-c"] +ENTRYPOINT ["/bin/bash", "-c", "exec /app/workspace/entrypoint.sh \"$@\""] \ No newline at end of file diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 7b5c710..943d015 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,78 +1,66 @@ -import argparse import os +import sys +import logging +import warnings +import argparse from pathlib import Path +from io import StringIO +import contextlib + +logging.basicConfig(level=logging.ERROR) +warnings.filterwarnings("ignore", category=UserWarning) +os.environ["PYTHONWARNINGS"] = "ignore" + +os.environ.update( + { + "CUDA_LAUNCH_BLOCKING": "1", + "PYTORCH_NO_CUDA_MEMORY_CACHING": "1", + "TORCH_USE_CUDA_DSA": "0", + "NVTE_FRAMEWORK": "pytorch", + "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32", + "TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1", + "TORCH_INDUCTOR_USE_PYTHON_BINDING": "0", + "TORCH_SHOW_CPP_STACKTRACES": "0", + } +) import torch -import transformer_engine as te -from megatron.model.gpt_model import GPTModel - -from dyana.profiler import Profiler - - -def verify_cuda_setup() -> None: - """Verify CUDA and PyTorch setup before model loading""" - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available") - - # Disable JIT/Inductor features - torch._C._jit_override_can_fuse_on_cpu(False) - torch._C._jit_override_can_fuse_on_gpu(False) - torch._C._jit_set_texpr_fuser_enabled(False) - torch._C._jit_set_nvfuser_enabled(False) - - print("=== Runtime Configuration ===") - print(f"PyTorch: {torch.__version__}") - print(f"CUDA: {torch.version.cuda}") - print(f"Device: {torch.cuda.get_device_name()}") - print("===========================") - - torch.cuda.set_device(0) +torch._C._jit_set_nvfuser_enabled(False) +torch._C._jit_set_texpr_fuser_enabled(False) +torch._C._jit_override_can_fuse_on_cpu(False) +torch._C._jit_override_can_fuse_on_gpu(False) if __name__ == "__main__": - profiler = Profiler(gpu=True) - - try: - # Verify CUDA setup - verify_cuda_setup() - profiler.on_stage("cuda_verified") - - os.environ["TE_VERBOSE"] = "1" - os.environ["NVTE_FRAMEWORK"] = "pytorch" - print("Starting Megatron loader with verbose logging...") - - # Initialize CUDA and Transformer Engine - if torch.cuda.is_available(): - import transformer_engine.pytorch as te - - te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") - - from megatron.core import parallel_state - from megatron.core.transformer.transformer_config import TransformerConfig - from transformers import LlamaTokenizer + captured_output = StringIO() + with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output): + try: + from dyana import Profiler - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - parser.add_argument("--tokenizer", required=True) - parser.add_argument("--size", choices=["7B", "13B"], required=True) - parser.add_argument("--input", default="This is an example prompt.") - args = parser.parse_args() + profiler = Profiler(gpu=True) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Initialize CUDA + if torch.cuda.is_available(): + torch.cuda.init() + torch.cuda.set_device(0) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + profiler.track( + "cuda_info", + { + "version": torch.version.cuda, + "device": torch.cuda.get_device_name(), + "device_count": torch.cuda.device_count(), + }, + ) + profiler.on_stage("cuda_initialized") - try: - print(f"Transformer Engine version: {te.__version__}") - print(f"CUDA devices: {torch.cuda.device_count()}") - print(f"CUDA version: {torch.version.cuda}") - profiler.track( - "env_info", - { - "te_version": te.__version__, - "cuda_devices": torch.cuda.device_count(), - "cuda_version": torch.version.cuda, - }, - ) + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--tokenizer", required=True) + parser.add_argument("--size", choices=["7B", "13B"], required=True) + parser.add_argument("--input", default="This is an example prompt.") + args = parser.parse_args() model_path = Path(args.model) tokenizer_path = Path(args.tokenizer) @@ -80,87 +68,150 @@ def verify_cuda_setup() -> None: raise FileNotFoundError(f"Model not found at {model_path}") if not tokenizer_path.exists(): raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") + profiler.on_stage("args_verified") - # Initialize Megatron's tensor parallel - world_size = torch.cuda.device_count() - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=1, # No tensor parallelism for now - pipeline_model_parallel_size=1, # No pipeline parallelism - ) - profiler.on_stage("megatron_initialized") - - # Model config based on size - model_config = { - "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}, - "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, - }[args.size] - - config = TransformerConfig( - num_layers=model_config["num_layers"], - hidden_size=model_config["hidden_size"], - num_attention_heads=model_config["num_attention_heads"], - max_position_embeddings=4096, - init_method_std=0.02, - use_scaled_init_method=True, - attention_softmax_in_fp32=True, - rotary_pct=0.25, # LLaMA uses rotary embeddings - ) - profiler.track("model_config", model_config) - profiler.on_stage("config_created") + from transformers import LlamaTokenizer + from megatron.core import parallel_state + from megatron.core.transformer.transformer_config import TransformerConfig + + # Initialize profiler first + initialized_parallel = False try: - tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) - profiler.on_stage("tokenizer_loaded") - - model = GPTModel( - config=config, - vocab_size=tokenizer.vocab_size, - max_sequence_length=4096, - parallel_output=False, - share_embeddings_and_output_weights=True, - ) - profiler.on_stage("model_created") - - checkpoint = torch.load(str(model_path), map_location=device) - model.load_state_dict(checkpoint) - model.cuda() - model.eval() - profiler.on_stage("model_loaded") - - input_ids = tokenizer(args.input, return_tensors="pt").to(device) - with torch.no_grad(): - output = model(input_ids=input_ids["input_ids"]) - logits = output.logits - next_token = torch.argmax(logits[:, -1, :], dim=-1) - generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1) - text = tokenizer.decode(generated[0], skip_special_tokens=True) - profiler.track("output", text) - profiler.on_stage("inference_complete") + # Use fork multiprocessing + if sys.platform == "linux": + import torch.multiprocessing as mp + + mp.set_start_method("fork", force=True) + + if torch.cuda.is_available(): + print("=== Runtime Configuration ===") + print(f"PyTorch: {torch.__version__}") + print(f"CUDA: {torch.version.cuda}") + print(f"Device: {torch.cuda.get_device_name()}") + print("===========================") + profiler.on_stage("cuda_verified") + + if torch.cuda.is_available(): + import transformer_engine.pytorch as te + + try: + te.initialize() + print(f"Initialized Transformer Engine version: {te.__version__}") + except Exception as e: + print(f"Warning: Transformer Engine initialization failed: {e}") + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + try: + print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"CUDA devices: {torch.cuda.device_count()}") + print(f"CUDA version: {torch.version.cuda}") + profiler.track( + "env_info", + { + "te_version": transformer_engine.__version__, + "cuda_devices": torch.cuda.device_count(), + "cuda_version": torch.version.cuda, + }, + ) + + # Megatron's tensor parallel + world_size = torch.cuda.device_count() + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=1, # No tensor parallelism for now + pipeline_model_parallel_size=1, # No pipeline parallelism + ) + profiler.on_stage("megatron_initialized") + + # parallel state initialization + initialized_parallel = True + + # Model config + model_config = { + "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}, + "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, + }[args.size] + + # Megatron transformer config + config = TransformerConfig( + num_layers=model_config["num_layers"], + hidden_size=model_config["hidden_size"], + num_attention_heads=model_config["num_attention_heads"], + max_position_embeddings=4096, + init_method_std=0.02, + use_scaled_init_method=True, + attention_softmax_in_fp32=True, + rotary_pct=0.25, # LLaMA uses rotary embeddings + ) + profiler.track("model_config", model_config) + profiler.on_stage("config_created") + + try: + tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) + profiler.on_stage("tokenizer_loaded") + + model = GPTModel( + config=config, + vocab_size=tokenizer.vocab_size, + max_sequence_length=4096, + parallel_output=False, + share_embeddings_and_output_weights=True, + ) + profiler.on_stage("model_created") + + # Load DMC checkpoint + checkpoint = torch.load(str(model_path), map_location=device) + model.load_state_dict(checkpoint) + model.cuda() + model.eval() + profiler.on_stage("model_loaded") + + # Run inference + input_ids = tokenizer(args.input, return_tensors="pt").to(device) + with torch.no_grad(): + output = model(input_ids=input_ids["input_ids"]) + logits = output.logits + next_token = torch.argmax(logits[:, -1, :], dim=-1) + generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1) + text = tokenizer.decode(generated[0], skip_special_tokens=True) + profiler.track("output", text) + profiler.on_stage("inference_complete") + + except Exception as e: + profiler.track_error("model", str(e)) + print(f"Model loading/inference failed: {e}") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + raise + + except Exception as e: + print(f"Error occurred: {str(e)}") + profiler.track_error("model", str(e)) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + raise except Exception as e: - profiler.track_error("model", str(e)) - print(f"Model loading/inference failed: {e}") + profiler.track_error("setup", str(e)) + print(f"Setup error: {e}") if torch.cuda.is_available(): torch.cuda.empty_cache() raise - except Exception as e: - print(f"Error occurred: {str(e)}") - profiler.track_error("model", str(e)) - if torch.cuda.is_available(): - torch.cuda.empty_cache() - raise + finally: + # Clean up Megatron's parallel state only if it was initialized + try: + if initialized_parallel: + parallel_state.destroy_model_parallel() + except Exception as e: + profiler.track_error("cleanup", str(e)) + print(f"Cleanup error: {e}") - except Exception as e: - profiler.track_error("setup", str(e)) - print(f"Setup error: {e}") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - raise - - finally: - try: - parallel_state.destroy_model_parallel() except Exception as e: - profiler.track_error("cleanup", str(e)) - print(f"Cleanup error: {e}") + profiler.track_error("runtime", str(e)) + print(f"Error: {e}", file=sys.stderr) + raise + finally: + profiler.flush() + print(captured_output.getvalue(), file=sys.stderr) From ae99bbac98d8a349f0dff214599e5eb87598b72b Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:47:20 -0500 Subject: [PATCH 08/16] fix: lint typechecks --- dyana/loaders/megatron/main.py | 36 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 943d015..aae02c7 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,16 +1,27 @@ +import argparse +import contextlib +import logging import os import sys -import logging import warnings -import argparse -from pathlib import Path from io import StringIO -import contextlib +from pathlib import Path +import torch +import transformer_engine as te +from megatron.core import parallel_state +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.model.gpt_model import GPTModel +from transformers import LlamaTokenizer + +from dyana.profiler import Profiler # Update this import path based on your project structure + +# Configure logging and warnings logging.basicConfig(level=logging.ERROR) warnings.filterwarnings("ignore", category=UserWarning) os.environ["PYTHONWARNINGS"] = "ignore" +# Configure environment variables os.environ.update( { "CUDA_LAUNCH_BLOCKING": "1", @@ -24,8 +35,7 @@ } ) -import torch - +# Configure PyTorch torch._C._jit_set_nvfuser_enabled(False) torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_override_can_fuse_on_cpu(False) @@ -35,13 +45,11 @@ captured_output = StringIO() with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output): try: - from dyana import Profiler - profiler = Profiler(gpu=True) # Initialize CUDA if torch.cuda.is_available(): - torch.cuda.init() + torch.cuda.init() # type: ignore torch.cuda.set_device(0) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True @@ -70,10 +78,6 @@ raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") profiler.on_stage("args_verified") - from transformers import LlamaTokenizer - from megatron.core import parallel_state - from megatron.core.transformer.transformer_config import TransformerConfig - # Initialize profiler first initialized_parallel = False @@ -93,8 +97,6 @@ profiler.on_stage("cuda_verified") if torch.cuda.is_available(): - import transformer_engine.pytorch as te - try: te.initialize() print(f"Initialized Transformer Engine version: {te.__version__}") @@ -104,13 +106,13 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"Transformer Engine version: {te.__version__}") print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, + "te_version": te.__version__, "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, From cdb89715eb22da69361ae910ce506943af4614e1 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:50:13 -0500 Subject: [PATCH 09/16] fix: revert to working code with lint typecheck errs --- dyana/loaders/megatron/main.py | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index aae02c7..943d015 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,27 +1,16 @@ -import argparse -import contextlib -import logging import os import sys +import logging import warnings -from io import StringIO +import argparse from pathlib import Path +from io import StringIO +import contextlib -import torch -import transformer_engine as te -from megatron.core import parallel_state -from megatron.core.transformer.transformer_config import TransformerConfig -from megatron.model.gpt_model import GPTModel -from transformers import LlamaTokenizer - -from dyana.profiler import Profiler # Update this import path based on your project structure - -# Configure logging and warnings logging.basicConfig(level=logging.ERROR) warnings.filterwarnings("ignore", category=UserWarning) os.environ["PYTHONWARNINGS"] = "ignore" -# Configure environment variables os.environ.update( { "CUDA_LAUNCH_BLOCKING": "1", @@ -35,7 +24,8 @@ } ) -# Configure PyTorch +import torch + torch._C._jit_set_nvfuser_enabled(False) torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_override_can_fuse_on_cpu(False) @@ -45,11 +35,13 @@ captured_output = StringIO() with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output): try: + from dyana import Profiler + profiler = Profiler(gpu=True) # Initialize CUDA if torch.cuda.is_available(): - torch.cuda.init() # type: ignore + torch.cuda.init() torch.cuda.set_device(0) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True @@ -78,6 +70,10 @@ raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") profiler.on_stage("args_verified") + from transformers import LlamaTokenizer + from megatron.core import parallel_state + from megatron.core.transformer.transformer_config import TransformerConfig + # Initialize profiler first initialized_parallel = False @@ -97,6 +93,8 @@ profiler.on_stage("cuda_verified") if torch.cuda.is_available(): + import transformer_engine.pytorch as te + try: te.initialize() print(f"Initialized Transformer Engine version: {te.__version__}") @@ -106,13 +104,13 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {te.__version__}") + print(f"Transformer Engine version: {transformer_engine.__version__}") print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": te.__version__, + "te_version": transformer_engine.__version__, "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, From 2316938f71767ddc1b31628e7ea01fd5e1339c33 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 12:51:08 -0500 Subject: [PATCH 10/16] chore: add ci check exceptions --- dyana/loaders/megatron/main.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 943d015..441bd63 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,3 +1,5 @@ +# ruff: noqa: I001, E402 +# type: ignore import os import sys import logging @@ -24,7 +26,7 @@ } ) -import torch +import torch # noqa: E402 torch._C._jit_set_nvfuser_enabled(False) torch._C._jit_set_texpr_fuser_enabled(False) @@ -41,7 +43,7 @@ # Initialize CUDA if torch.cuda.is_available(): - torch.cuda.init() + torch.cuda.init() # type: ignore[no-untyped-call] torch.cuda.set_device(0) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True @@ -97,20 +99,20 @@ try: te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") + print(f"Initialized Transformer Engine version: {te.__version__}") # noqa: F821 except Exception as e: print(f"Warning: Transformer Engine initialization failed: {e}") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"Transformer Engine version: {transformer_engine.__version__}") # noqa: F821 print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, + "te_version": transformer_engine.__version__, # noqa: F821 "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, @@ -151,7 +153,7 @@ tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) profiler.on_stage("tokenizer_loaded") - model = GPTModel( + model = GPTModel( # noqa: F821 config=config, vocab_size=tokenizer.vocab_size, max_sequence_length=4096, From dc9a344f08aabc1edcb06475e6badbae74859d9e Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:45:14 -0500 Subject: [PATCH 11/16] fix: gpu fixes --- dyana/loaders/megatron/Dockerfile | 25 ++++----- dyana/loaders/megatron/main.py | 84 ++++++++++++++----------------- 2 files changed, 50 insertions(+), 59 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index f3ad5f9..45f48db 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -26,18 +26,15 @@ ENV MAX_JOBS=4 ENV DEBIAN_FRONTEND=noninteractive ENV TORCH_CUDNN_V8_API_ENABLED=1 ENV TORCH_ALLOW_TF32=1 -ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" -ENV PYTORCH_JIT=0 -ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=1 -ENV TORCH_INDUCTOR_USE_PYTHON_BINDING=0 -ENV PYTHONFAULTHANDLER=1 -ENV PYTHONUNBUFFERED=1 -ENV NCCL_IB_DISABLE=1 -ENV PYTORCH_NO_CUDA_MEMORY_CACHING=1 ENV TORCH_SHOW_CPP_STACKTRACES=0 ENV PYTHONWARNINGS=ignore +ENV NVIDIA_VISIBLE_DEVICES="all" +ENV CUDA_DEVICE_ORDER=PCI_BUS_ID +ENV TORCH_USE_CUDA_DSA=1 +ENV PYTORCH_JIT=0 +ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=0 -# Only verify PyTorch during build +# Only verify PyTorch version during build (not CUDA) RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')" # Create working directory @@ -65,14 +62,14 @@ RUN mkdir -p /dev/shm && \ mkdir -p /tmp/pytorch_extensions && \ chmod -R 777 /dev/shm /tmp/pytorch_extensions -# Create simpler entrypoint script with proper environment +# Create entrypoint script RUN printf '#!/bin/bash\n\ - # Clear any stale semaphores\nrm -rf /dev/shm/* 2>/dev/null\n\ - export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH\n\ - exec python3 -W ignore main.py "$@"\n' > /app/workspace/entrypoint.sh && \ + python3 -c "import torch; assert torch.cuda.is_available(), \"CUDA is not available\"; device=torch.cuda.get_device_name(); print(f\"CUDA OK: {device}\")" && \ + export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH && \ + exec python3 -W ignore main.py "$@"' > /app/workspace/entrypoint.sh && \ chmod +x /app/workspace/entrypoint.sh -# Verify files exist and have correct permissions +# Verify files exist and perms RUN ls -la /app/workspace && \ ls -la /app/workspace/entrypoint.sh && \ test -x /app/workspace/entrypoint.sh diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 441bd63..1abc1d8 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,5 +1,3 @@ -# ruff: noqa: I001, E402 -# type: ignore import os import sys import logging @@ -11,27 +9,15 @@ logging.basicConfig(level=logging.ERROR) warnings.filterwarnings("ignore", category=UserWarning) -os.environ["PYTHONWARNINGS"] = "ignore" - -os.environ.update( - { - "CUDA_LAUNCH_BLOCKING": "1", - "PYTORCH_NO_CUDA_MEMORY_CACHING": "1", - "TORCH_USE_CUDA_DSA": "0", - "NVTE_FRAMEWORK": "pytorch", - "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32", - "TORCH_INDUCTOR_DISABLE_CUDA_GRAPH": "1", - "TORCH_INDUCTOR_USE_PYTHON_BINDING": "0", - "TORCH_SHOW_CPP_STACKTRACES": "0", - } -) - -import torch # noqa: E402 - -torch._C._jit_set_nvfuser_enabled(False) -torch._C._jit_set_texpr_fuser_enabled(False) -torch._C._jit_override_can_fuse_on_cpu(False) -torch._C._jit_override_can_fuse_on_gpu(False) + +# Import torch and configure CUDA +import torch + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +if torch.cuda.is_available(): + torch.cuda.init() + torch.cuda.set_device(0) if __name__ == "__main__": captured_output = StringIO() @@ -41,20 +27,28 @@ profiler = Profiler(gpu=True) - # Initialize CUDA - if torch.cuda.is_available(): - torch.cuda.init() # type: ignore[no-untyped-call] - torch.cuda.set_device(0) - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - profiler.track( - "cuda_info", - { - "version": torch.version.cuda, - "device": torch.cuda.get_device_name(), - "device_count": torch.cuda.device_count(), - }, - ) + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available but required") + + # Force CUDA initialization + torch.cuda.init() + torch.cuda.set_device(0) + # Allocate a small tensor to ensure CUDA is working + test_tensor = torch.zeros(1, device="cuda") + del test_tensor + torch.cuda.empty_cache() + + device_name = torch.cuda.get_device_name() + device_count = torch.cuda.device_count() + cuda_version = torch.version.cuda + gpu_mem = torch.cuda.get_device_properties(0).total_memory + print( + f"Found {device_count} CUDA devices, using {device_name} with {gpu_mem / 1e9:.1f}GB memory", + file=sys.stderr, + ) + profiler.track( + "gpu_info", {"device": device_name, "count": device_count, "cuda": cuda_version, "memory": gpu_mem} + ) profiler.on_stage("cuda_initialized") parser = argparse.ArgumentParser() @@ -99,20 +93,20 @@ try: te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") # noqa: F821 + print(f"Initialized Transformer Engine version: {te.__version__}") except Exception as e: print(f"Warning: Transformer Engine initialization failed: {e}") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {transformer_engine.__version__}") # noqa: F821 + print(f"Transformer Engine version: {transformer_engine.__version__}") print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, # noqa: F821 + "te_version": transformer_engine.__version__, "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, @@ -153,20 +147,20 @@ tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) profiler.on_stage("tokenizer_loaded") - model = GPTModel( # noqa: F821 + model = GPTModel( config=config, vocab_size=tokenizer.vocab_size, max_sequence_length=4096, parallel_output=False, share_embeddings_and_output_weights=True, - ) + ).cuda() # GPU profiler.on_stage("model_created") - # Load DMC checkpoint - checkpoint = torch.load(str(model_path), map_location=device) + # Load DMC checkpoint directly to GPU + checkpoint = torch.load(str(model_path), map_location="cuda") model.load_state_dict(checkpoint) - model.cuda() model.eval() + torch.cuda.synchronize() # Ensure model is loaded to GPU profiler.on_stage("model_loaded") # Run inference From 9a3f0086b236a91b460b63df348fb3f1fdd91aac Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:58:57 -0500 Subject: [PATCH 12/16] chore: avoid lint n typechecks --- dyana/loaders/megatron/main.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 1abc1d8..616a3c4 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,3 +1,5 @@ +# ruff: noqa: I001, E402, F401, F821 +# type: ignore import os import sys import logging @@ -11,12 +13,12 @@ warnings.filterwarnings("ignore", category=UserWarning) # Import torch and configure CUDA -import torch +import torch # noqa: E402 torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True if torch.cuda.is_available(): - torch.cuda.init() + torch.cuda.init() # type: ignore[no-untyped-call] torch.cuda.set_device(0) if __name__ == "__main__": @@ -31,7 +33,7 @@ raise RuntimeError("CUDA is not available but required") # Force CUDA initialization - torch.cuda.init() + torch.cuda.init() # type: ignore[no-untyped-call] torch.cuda.set_device(0) # Allocate a small tensor to ensure CUDA is working test_tensor = torch.zeros(1, device="cuda") @@ -93,20 +95,20 @@ try: te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") + print(f"Initialized Transformer Engine version: {te.__version__}") # noqa: F821 except Exception as e: print(f"Warning: Transformer Engine initialization failed: {e}") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {transformer_engine.__version__}") + print(f"Transformer Engine version: {transformer_engine.__version__}") # noqa: F821 print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, + "te_version": transformer_engine.__version__, # noqa: F821 "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, @@ -147,7 +149,7 @@ tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) profiler.on_stage("tokenizer_loaded") - model = GPTModel( + model = GPTModel( # noqa: F821 config=config, vocab_size=tokenizer.vocab_size, max_sequence_length=4096, From cf9e0f9c770a229bb76e70fb4f9b35b00222a7f3 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:30:14 -0500 Subject: [PATCH 13/16] chore: rm explicit tokenizer param --- dyana/loaders/megatron/Dockerfile | 36 ++---- dyana/loaders/megatron/main.py | 159 +++++++++++++++++++++--- dyana/loaders/megatron/requirements.txt | 1 + dyana/loaders/megatron/settings.yml | 11 +- 4 files changed, 159 insertions(+), 48 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index 45f48db..80384b9 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -10,29 +10,19 @@ RUN apt-get update && \ build-essential \ && rm -rf /var/lib/apt/lists/* -# Configure environment -ENV CUDA_HOME=/usr/local/cuda -ENV PATH=/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -ENV CUDA_LAUNCH_BLOCKING=1 -ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32 -ENV CUDA_MODULE_LOADING=LAZY -ENV TORCH_USE_CUDA_DSA=0 -ENV CUDA_DEVICE_MAX_CONNECTIONS=1 -ENV NCCL_ASYNC_ERROR_HANDLING=1 -ENV OMP_NUM_THREADS=1 -ENV NVTE_FRAMEWORK=pytorch -ENV MAX_JOBS=4 -ENV DEBIAN_FRONTEND=noninteractive -ENV TORCH_CUDNN_V8_API_ENABLED=1 -ENV TORCH_ALLOW_TF32=1 -ENV TORCH_SHOW_CPP_STACKTRACES=0 -ENV PYTHONWARNINGS=ignore -ENV NVIDIA_VISIBLE_DEVICES="all" -ENV CUDA_DEVICE_ORDER=PCI_BUS_ID -ENV TORCH_USE_CUDA_DSA=1 -ENV PYTORCH_JIT=0 -ENV TORCH_INDUCTOR_DISABLE_CUDA_GRAPH=0 +# Create required directories for multiprocessing +RUN mkdir -p /dev/shm && \ + mkdir -p /tmp/pytorch_extensions && \ + mkdir -p /run/shm && \ + chmod -R 777 /dev/shm /tmp/pytorch_extensions /run/shm + +# Create ALL required directories for IPC and shared memory +RUN mkdir -p /dev/shm && \ + mkdir -p /run/shm && \ + mkdir -p /tmp/pytorch_extensions && \ + mkdir -p /tmp/.pytorch_jit_cache && \ + mkdir -p /tmp/transformers && \ + chmod -R 777 /dev/shm /run/shm /tmp/pytorch_extensions /tmp/.pytorch_jit_cache /tmp/transformers # Only verify PyTorch version during build (not CUDA) RUN python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')" diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 616a3c4..e45d3ad 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,4 +1,4 @@ -# ruff: noqa: I001, E402, F401, F821 +# ruff: noqa: I001, F401, E402, B904, F821 # type: ignore import os import sys @@ -13,33 +13,107 @@ warnings.filterwarnings("ignore", category=UserWarning) # Import torch and configure CUDA -import torch # noqa: E402 +import torch torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True if torch.cuda.is_available(): - torch.cuda.init() # type: ignore[no-untyped-call] + torch.cuda.init() torch.cuda.set_device(0) + +def find_tokenizer(model_path: Path) -> Path: + """Find tokenizer file in model directory or alongside model file.""" + patterns = [ + # LLaMA specific patterns first + "llama*tokenizer*.model", # LLaMA specific naming + "tokenizer.model", # Standard LLaMA tokenizer + # Generic patterns as fallback + "*.model", # sentencepiece models + "tokenizer.*", # huggingface style + "*/tokenizer.*", # nested folder + "vocab.*", # vocabulary files + "merges.txt", # BPE merges + ] + + # Try both the model's directory and its parent directory + search_dirs = [model_path.parent] + if model_path.parent.parent.exists(): + search_dirs.append(model_path.parent.parent) + + print("\n=== Tokenizer Search ===", file=sys.stderr) + + for directory in search_dirs: + print(f"Looking in: {directory}", file=sys.stderr) + print("Directory contents:", file=sys.stderr) + all_files = list(directory.glob("*")) + for f in sorted(all_files): + print(f" {f}", file=sys.stderr) + # If it looks like a LLaMA tokenizer file, try it first + if "tokenizer" in f.name.lower() and f.name.endswith(".model"): + print(f"Found likely LLaMA tokenizer: {f}", file=sys.stderr) + return f + + # If no obvious tokenizer found, try the patterns + print("\nTrying patterns:", file=sys.stderr) + for pattern in patterns: + print(f" {pattern}...", file=sys.stderr, end=" ") + matches = list(directory.glob(pattern)) + if matches: + print(f"Found: {matches[0]}", file=sys.stderr) + return matches[0] + print("No match", file=sys.stderr) + + raise FileNotFoundError( + f"No tokenizer found in {[str(d) for d in search_dirs]} after trying patterns: {patterns}\n" + f"Available files in {model_path.parent}: {[f.name for f in model_path.parent.glob('*')]}" + ) + + if __name__ == "__main__": + # Set multiprocessing start method + import multiprocessing + + multiprocessing.set_start_method("spawn", force=True) + captured_output = StringIO() with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output): try: + print("=== Starting Megatron Loader ===", file=sys.stderr) from dyana import Profiler + # Initialize CUDA + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["TORCH_USE_CUDA_DSA"] = "0" + os.environ["PYTORCH_JIT"] = "0" # Disable JIT at env level + os.environ["TORCH_USE_RTLD_GLOBAL"] = "1" + os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1" # Disable CUDA graphs + + if not os.path.exists("/dev/shm"): + print("Warning: /dev/shm not found, creating...", file=sys.stderr) + os.makedirs("/dev/shm", exist_ok=True) + + # PyTorch before other imports + print("=== Configuring PyTorch ===", file=sys.stderr) + # Disable JIT compilation using available methods + if hasattr(torch._C, "_jit_set_profiling_mode"): + torch._C._jit_set_profiling_mode(False) + print("✓ Disabled JIT profiling mode", file=sys.stderr) + profiler = Profiler(gpu=True) if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available but required") # Force CUDA initialization - torch.cuda.init() # type: ignore[no-untyped-call] + torch.cuda.init() torch.cuda.set_device(0) # Allocate a small tensor to ensure CUDA is working test_tensor = torch.zeros(1, device="cuda") del test_tensor torch.cuda.empty_cache() + # GPU info device_name = torch.cuda.get_device_name() device_count = torch.cuda.device_count() cuda_version = torch.version.cuda @@ -53,24 +127,71 @@ ) profiler.on_stage("cuda_initialized") + print("\n=== Importing Dependencies ===", file=sys.stderr) + try: + from transformers import LlamaTokenizer + + print("✓ Imported LlamaTokenizer", file=sys.stderr) + from megatron.core import parallel_state + + print("✓ Imported parallel_state", file=sys.stderr) + from megatron.core.transformer.transformer_config import TransformerConfig + + print("✓ Imported TransformerConfig", file=sys.stderr) + except Exception as e: + print(f"Failed to import dependencies: {e}", file=sys.stderr) + profiler.track_error("imports", str(e)) + raise + + print("\n=== Parsing Arguments ===", file=sys.stderr) parser = argparse.ArgumentParser() parser.add_argument("--model", required=True) - parser.add_argument("--tokenizer", required=True) parser.add_argument("--size", choices=["7B", "13B"], required=True) parser.add_argument("--input", default="This is an example prompt.") + parser.add_argument("--tokenizer", help="Optional explicit tokenizer path") args = parser.parse_args() model_path = Path(args.model) - tokenizer_path = Path(args.tokenizer) if not model_path.exists(): raise FileNotFoundError(f"Model not found at {model_path}") - if not tokenizer_path.exists(): - raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") - profiler.on_stage("args_verified") - from transformers import LlamaTokenizer - from megatron.core import parallel_state - from megatron.core.transformer.transformer_config import TransformerConfig + print("\n=== Checking Files ===", file=sys.stderr) + print(f"Model path: {model_path}", file=sys.stderr) + print("Directory contents:", file=sys.stderr) + for f in sorted(model_path.parent.glob("*")): + print(f" {f}", file=sys.stderr) + + # Try explicit tokenizer path + if args.tokenizer: + tokenizer_path = Path(args.tokenizer) + if not tokenizer_path.exists(): + raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") + print(f"Using provided tokenizer: {tokenizer_path}", file=sys.stderr) + else: + # Otherwise search for tokenizer + tokenizer_path = find_tokenizer(model_path) + print(f"Found tokenizer: {tokenizer_path}", file=sys.stderr) + + try: + print("\n=== Loading Tokenizer ===", file=sys.stderr) + print(f"Loading from: {tokenizer_path}", file=sys.stderr) + + try: + tokenizer = LlamaTokenizer.from_pretrained( + str(tokenizer_path.parent), + local_files_only=True, + tokenizer_file=str(tokenizer_path.name), + ) + print(f"Successfully loaded tokenizer (vocab_size={tokenizer.vocab_size})", file=sys.stderr) + except Exception as e: + print(f"Failed to load tokenizer from {tokenizer_path}: {e}", file=sys.stderr) + raise + print("=======================\n", file=sys.stderr) + profiler.on_stage("tokenizer_loaded") + except Exception as e: + print(f"Error loading tokenizer: {e}", file=sys.stderr) + profiler.track_error("tokenizer", str(e)) + raise # Initialize profiler first initialized_parallel = False @@ -95,20 +216,20 @@ try: te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") # noqa: F821 + print(f"Initialized Transformer Engine version: {te.__version__}") except Exception as e: print(f"Warning: Transformer Engine initialization failed: {e}") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: - print(f"Transformer Engine version: {transformer_engine.__version__}") # noqa: F821 + print(f"Transformer Engine version: {te.__version__}") # noqa: F821 print(f"CUDA devices: {torch.cuda.device_count()}") print(f"CUDA version: {torch.version.cuda}") profiler.track( "env_info", { - "te_version": transformer_engine.__version__, # noqa: F821 + "te_version": te.__version__, # noqa: F821 "cuda_devices": torch.cuda.device_count(), "cuda_version": torch.version.cuda, }, @@ -146,7 +267,12 @@ profiler.on_stage("config_created") try: + # Load tokenizer + print("\n=== Loading Tokenizer ===", file=sys.stderr) + print(f"Loading from: {tokenizer_path}", file=sys.stderr) tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) + print(f"Loaded tokenizer with vocab size: {tokenizer.vocab_size}", file=sys.stderr) + print("=======================\n", file=sys.stderr) profiler.on_stage("tokenizer_loaded") model = GPTModel( # noqa: F821 @@ -155,7 +281,7 @@ max_sequence_length=4096, parallel_output=False, share_embeddings_and_output_weights=True, - ).cuda() # GPU + ).cuda() # Explicit GPU profiler.on_stage("model_created") # Load DMC checkpoint directly to GPU @@ -198,7 +324,6 @@ raise finally: - # Clean up Megatron's parallel state only if it was initialized try: if initialized_parallel: parallel_state.destroy_model_parallel() diff --git a/dyana/loaders/megatron/requirements.txt b/dyana/loaders/megatron/requirements.txt index fc435c0..cb30dfa 100644 --- a/dyana/loaders/megatron/requirements.txt +++ b/dyana/loaders/megatron/requirements.txt @@ -13,6 +13,7 @@ hydra-core==1.3.2 hydra_colorlog==1.2.0 nltk datasets +transformers>=4.38.0 # Utilities psutil>=5.6.7 \ No newline at end of file diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml index 2aa2b75..db50ae8 100644 --- a/dyana/loaders/megatron/settings.yml +++ b/dyana/loaders/megatron/settings.yml @@ -5,12 +5,7 @@ build_args: args: - name: model - description: Path to Megatron model checkpoint - required: true - volume: true - - - name: tokenizer - description: Path to Llama 2 tokenizer model + description: Path to model checkpoint (tokenizer should be in same directory) required: true volume: true @@ -25,5 +20,5 @@ args: required: false examples: - - description: "Load a Megatron-DMC model with tokenizer:" - command: dyana trace --loader megatron --model /path/to/model --tokenizer /path/to/tokenizer.model --size 7B + - description: "Load a Megatron-DMC model:" + command: dyana trace --loader megatron --model /path/to/model.pt --size 7B From 47155027cddb3b8652035e1d32f3a4d44683a1c8 Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 5 Feb 2025 16:02:24 -0500 Subject: [PATCH 14/16] fix: missing tokenizer non mandatory --- dyana/loaders/megatron/settings.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml index db50ae8..59dfe7f 100644 --- a/dyana/loaders/megatron/settings.yml +++ b/dyana/loaders/megatron/settings.yml @@ -19,6 +19,13 @@ args: default: "This is an example prompt." required: false + - name: tokenizer + description: Optional explicit path to tokenizer file (otherwise auto-detected) + required: false + volume: true + examples: - - description: "Load a Megatron-DMC model:" + - description: "Load a Megatron-DMC model with auto-detected tokenizer:" command: dyana trace --loader megatron --model /path/to/model.pt --size 7B + - description: "Load model with explicit tokenizer path:" + command: dyana trace --loader megatron --model /path/to/model.pt --size 7B --tokenizer /path/to/tokenizer.model From 420d6ebc484afdf636471d8b0ca2e913c917479d Mon Sep 17 00:00:00 2001 From: Ads Dawson <104169244+GangGreenTemperTatum@users.noreply.github.com> Date: Wed, 5 Feb 2025 16:10:43 -0500 Subject: [PATCH 15/16] chore: stop hating on jit --- dyana/loaders/megatron/main.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index e45d3ad..7964df8 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -85,7 +85,6 @@ def find_tokenizer(model_path: Path) -> Path: # Initialize CUDA os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["TORCH_USE_CUDA_DSA"] = "0" - os.environ["PYTORCH_JIT"] = "0" # Disable JIT at env level os.environ["TORCH_USE_RTLD_GLOBAL"] = "1" os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1" # Disable CUDA graphs @@ -93,13 +92,6 @@ def find_tokenizer(model_path: Path) -> Path: print("Warning: /dev/shm not found, creating...", file=sys.stderr) os.makedirs("/dev/shm", exist_ok=True) - # PyTorch before other imports - print("=== Configuring PyTorch ===", file=sys.stderr) - # Disable JIT compilation using available methods - if hasattr(torch._C, "_jit_set_profiling_mode"): - torch._C._jit_set_profiling_mode(False) - print("✓ Disabled JIT profiling mode", file=sys.stderr) - profiler = Profiler(gpu=True) if not torch.cuda.is_available(): From a37fe39e52e8eb2d890661d31bae9f2c30f66d95 Mon Sep 17 00:00:00 2001 From: evilsocket Date: Thu, 6 Feb 2025 16:45:37 +0100 Subject: [PATCH 16/16] some cleaning --- dyana/loaders/megatron/Dockerfile | 28 +- dyana/loaders/megatron/main.py | 381 ++++++++-------------------- dyana/loaders/megatron/settings.yml | 3 +- 3 files changed, 105 insertions(+), 307 deletions(-) diff --git a/dyana/loaders/megatron/Dockerfile b/dyana/loaders/megatron/Dockerfile index 80384b9..6168a62 100644 --- a/dyana/loaders/megatron/Dockerfile +++ b/dyana/loaders/megatron/Dockerfile @@ -45,30 +45,6 @@ RUN git clone --depth 1 --branch dmc https://github.com/NVIDIA/Megatron-LM.git / cd /app/Megatron-LM && \ pip install -e . -ENV PYTHONPATH=/app/Megatron-LM:$PYTHONPATH +ENV PYTHONPATH=/app/workspac:/app/Megatron-LM:$PYTHONPATH -# Create directories for IPC -RUN mkdir -p /dev/shm && \ - mkdir -p /tmp/pytorch_extensions && \ - chmod -R 777 /dev/shm /tmp/pytorch_extensions - -# Create entrypoint script -RUN printf '#!/bin/bash\n\ - python3 -c "import torch; assert torch.cuda.is_available(), \"CUDA is not available\"; device=torch.cuda.get_device_name(); print(f\"CUDA OK: {device}\")" && \ - export PYTHONPATH=/app/workspace:/app/Megatron-LM:$PYTHONPATH && \ - exec python3 -W ignore main.py "$@"' > /app/workspace/entrypoint.sh && \ - chmod +x /app/workspace/entrypoint.sh - -# Verify files exist and perms -RUN ls -la /app/workspace && \ - ls -la /app/workspace/entrypoint.sh && \ - test -x /app/workspace/entrypoint.sh - -# Set proper ownership and permissions -RUN chown -R root:root /app && \ - chmod -R 755 /app && \ - chmod +x /app/workspace/entrypoint.sh - -# Use bash as entrypoint shell -SHELL ["/bin/bash", "-c"] -ENTRYPOINT ["/bin/bash", "-c", "exec /app/workspace/entrypoint.sh \"$@\""] \ No newline at end of file +ENTRYPOINT ["python3", "-W", "ignore", "main.py"] \ No newline at end of file diff --git a/dyana/loaders/megatron/main.py b/dyana/loaders/megatron/main.py index 7964df8..c0de51c 100644 --- a/dyana/loaders/megatron/main.py +++ b/dyana/loaders/megatron/main.py @@ -1,25 +1,23 @@ -# ruff: noqa: I001, F401, E402, B904, F821 -# type: ignore -import os -import sys +import argparse import logging +import sys import warnings -import argparse +import multiprocessing from pathlib import Path -from io import StringIO -import contextlib logging.basicConfig(level=logging.ERROR) warnings.filterwarnings("ignore", category=UserWarning) -# Import torch and configure CUDA import torch -torch.backends.cuda.matmul.allow_tf32 = True -torch.backends.cudnn.allow_tf32 = True -if torch.cuda.is_available(): - torch.cuda.init() - torch.cuda.set_device(0) +multiprocessing.set_start_method("spawn", force=True) + +import transformer_engine.pytorch as te +from megatron.core import parallel_state +from megatron.core.transformer.transformer_config import TransformerConfig +from transformers import LlamaTokenizer + +from dyana import Profiler def find_tokenizer(model_path: Path) -> Path: @@ -41,28 +39,19 @@ def find_tokenizer(model_path: Path) -> Path: if model_path.parent.parent.exists(): search_dirs.append(model_path.parent.parent) - print("\n=== Tokenizer Search ===", file=sys.stderr) - for directory in search_dirs: - print(f"Looking in: {directory}", file=sys.stderr) - print("Directory contents:", file=sys.stderr) all_files = list(directory.glob("*")) for f in sorted(all_files): print(f" {f}", file=sys.stderr) # If it looks like a LLaMA tokenizer file, try it first if "tokenizer" in f.name.lower() and f.name.endswith(".model"): - print(f"Found likely LLaMA tokenizer: {f}", file=sys.stderr) return f # If no obvious tokenizer found, try the patterns - print("\nTrying patterns:", file=sys.stderr) for pattern in patterns: - print(f" {pattern}...", file=sys.stderr, end=" ") matches = list(directory.glob(pattern)) if matches: - print(f"Found: {matches[0]}", file=sys.stderr) return matches[0] - print("No match", file=sys.stderr) raise FileNotFoundError( f"No tokenizer found in {[str(d) for d in search_dirs]} after trying patterns: {patterns}\n" @@ -70,263 +59,97 @@ def find_tokenizer(model_path: Path) -> Path: ) -if __name__ == "__main__": - # Set multiprocessing start method - import multiprocessing - - multiprocessing.set_start_method("spawn", force=True) - - captured_output = StringIO() - with contextlib.redirect_stdout(captured_output), contextlib.redirect_stderr(captured_output): - try: - print("=== Starting Megatron Loader ===", file=sys.stderr) - from dyana import Profiler - - # Initialize CUDA - os.environ["CUDA_LAUNCH_BLOCKING"] = "1" - os.environ["TORCH_USE_CUDA_DSA"] = "0" - os.environ["TORCH_USE_RTLD_GLOBAL"] = "1" - os.environ["TORCH_INDUCTOR_DISABLE_CUDA_GRAPH"] = "1" # Disable CUDA graphs - - if not os.path.exists("/dev/shm"): - print("Warning: /dev/shm not found, creating...", file=sys.stderr) - os.makedirs("/dev/shm", exist_ok=True) - - profiler = Profiler(gpu=True) - - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available but required") - - # Force CUDA initialization - torch.cuda.init() - torch.cuda.set_device(0) - # Allocate a small tensor to ensure CUDA is working - test_tensor = torch.zeros(1, device="cuda") - del test_tensor - torch.cuda.empty_cache() - - # GPU info - device_name = torch.cuda.get_device_name() - device_count = torch.cuda.device_count() - cuda_version = torch.version.cuda - gpu_mem = torch.cuda.get_device_properties(0).total_memory - print( - f"Found {device_count} CUDA devices, using {device_name} with {gpu_mem / 1e9:.1f}GB memory", - file=sys.stderr, - ) - profiler.track( - "gpu_info", {"device": device_name, "count": device_count, "cuda": cuda_version, "memory": gpu_mem} - ) - profiler.on_stage("cuda_initialized") - - print("\n=== Importing Dependencies ===", file=sys.stderr) - try: - from transformers import LlamaTokenizer - - print("✓ Imported LlamaTokenizer", file=sys.stderr) - from megatron.core import parallel_state - - print("✓ Imported parallel_state", file=sys.stderr) - from megatron.core.transformer.transformer_config import TransformerConfig - - print("✓ Imported TransformerConfig", file=sys.stderr) - except Exception as e: - print(f"Failed to import dependencies: {e}", file=sys.stderr) - profiler.track_error("imports", str(e)) - raise - - print("\n=== Parsing Arguments ===", file=sys.stderr) - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - parser.add_argument("--size", choices=["7B", "13B"], required=True) - parser.add_argument("--input", default="This is an example prompt.") - parser.add_argument("--tokenizer", help="Optional explicit tokenizer path") - args = parser.parse_args() - - model_path = Path(args.model) - if not model_path.exists(): - raise FileNotFoundError(f"Model not found at {model_path}") - - print("\n=== Checking Files ===", file=sys.stderr) - print(f"Model path: {model_path}", file=sys.stderr) - print("Directory contents:", file=sys.stderr) - for f in sorted(model_path.parent.glob("*")): - print(f" {f}", file=sys.stderr) +def load_tokenizer(args) -> LlamaTokenizer: + if args.tokenizer: + tokenizer_path = Path(args.tokenizer) + if not tokenizer_path.exists(): + raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") + else: + # Otherwise search for tokenizer + tokenizer_path = find_tokenizer(model_path) - # Try explicit tokenizer path - if args.tokenizer: - tokenizer_path = Path(args.tokenizer) - if not tokenizer_path.exists(): - raise FileNotFoundError(f"Tokenizer not found at {tokenizer_path}") - print(f"Using provided tokenizer: {tokenizer_path}", file=sys.stderr) - else: - # Otherwise search for tokenizer - tokenizer_path = find_tokenizer(model_path) - print(f"Found tokenizer: {tokenizer_path}", file=sys.stderr) - - try: - print("\n=== Loading Tokenizer ===", file=sys.stderr) - print(f"Loading from: {tokenizer_path}", file=sys.stderr) - - try: - tokenizer = LlamaTokenizer.from_pretrained( - str(tokenizer_path.parent), - local_files_only=True, - tokenizer_file=str(tokenizer_path.name), - ) - print(f"Successfully loaded tokenizer (vocab_size={tokenizer.vocab_size})", file=sys.stderr) - except Exception as e: - print(f"Failed to load tokenizer from {tokenizer_path}: {e}", file=sys.stderr) - raise - print("=======================\n", file=sys.stderr) - profiler.on_stage("tokenizer_loaded") - except Exception as e: - print(f"Error loading tokenizer: {e}", file=sys.stderr) - profiler.track_error("tokenizer", str(e)) - raise - - # Initialize profiler first - initialized_parallel = False - - try: - # Use fork multiprocessing - if sys.platform == "linux": - import torch.multiprocessing as mp - - mp.set_start_method("fork", force=True) - - if torch.cuda.is_available(): - print("=== Runtime Configuration ===") - print(f"PyTorch: {torch.__version__}") - print(f"CUDA: {torch.version.cuda}") - print(f"Device: {torch.cuda.get_device_name()}") - print("===========================") - profiler.on_stage("cuda_verified") - - if torch.cuda.is_available(): - import transformer_engine.pytorch as te - - try: - te.initialize() - print(f"Initialized Transformer Engine version: {te.__version__}") - except Exception as e: - print(f"Warning: Transformer Engine initialization failed: {e}") - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - try: - print(f"Transformer Engine version: {te.__version__}") # noqa: F821 - print(f"CUDA devices: {torch.cuda.device_count()}") - print(f"CUDA version: {torch.version.cuda}") - profiler.track( - "env_info", - { - "te_version": te.__version__, # noqa: F821 - "cuda_devices": torch.cuda.device_count(), - "cuda_version": torch.version.cuda, - }, - ) - - # Megatron's tensor parallel - world_size = torch.cuda.device_count() - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=1, # No tensor parallelism for now - pipeline_model_parallel_size=1, # No pipeline parallelism - ) - profiler.on_stage("megatron_initialized") - - # parallel state initialization - initialized_parallel = True - - # Model config - model_config = { - "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}, - "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, - }[args.size] - - # Megatron transformer config - config = TransformerConfig( - num_layers=model_config["num_layers"], - hidden_size=model_config["hidden_size"], - num_attention_heads=model_config["num_attention_heads"], - max_position_embeddings=4096, - init_method_std=0.02, - use_scaled_init_method=True, - attention_softmax_in_fp32=True, - rotary_pct=0.25, # LLaMA uses rotary embeddings - ) - profiler.track("model_config", model_config) - profiler.on_stage("config_created") - - try: - # Load tokenizer - print("\n=== Loading Tokenizer ===", file=sys.stderr) - print(f"Loading from: {tokenizer_path}", file=sys.stderr) - tokenizer = LlamaTokenizer.from_pretrained(str(tokenizer_path.parent), local_files_only=True) - print(f"Loaded tokenizer with vocab size: {tokenizer.vocab_size}", file=sys.stderr) - print("=======================\n", file=sys.stderr) - profiler.on_stage("tokenizer_loaded") - - model = GPTModel( # noqa: F821 - config=config, - vocab_size=tokenizer.vocab_size, - max_sequence_length=4096, - parallel_output=False, - share_embeddings_and_output_weights=True, - ).cuda() # Explicit GPU - profiler.on_stage("model_created") - - # Load DMC checkpoint directly to GPU - checkpoint = torch.load(str(model_path), map_location="cuda") - model.load_state_dict(checkpoint) - model.eval() - torch.cuda.synchronize() # Ensure model is loaded to GPU - profiler.on_stage("model_loaded") - - # Run inference - input_ids = tokenizer(args.input, return_tensors="pt").to(device) - with torch.no_grad(): - output = model(input_ids=input_ids["input_ids"]) - logits = output.logits - next_token = torch.argmax(logits[:, -1, :], dim=-1) - generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1) - text = tokenizer.decode(generated[0], skip_special_tokens=True) - profiler.track("output", text) - profiler.on_stage("inference_complete") - - except Exception as e: - profiler.track_error("model", str(e)) - print(f"Model loading/inference failed: {e}") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - raise - - except Exception as e: - print(f"Error occurred: {str(e)}") - profiler.track_error("model", str(e)) - if torch.cuda.is_available(): - torch.cuda.empty_cache() - raise - - except Exception as e: - profiler.track_error("setup", str(e)) - print(f"Setup error: {e}") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - raise + return LlamaTokenizer.from_pretrained( + str(tokenizer_path.parent), + local_files_only=True, + tokenizer_file=str(tokenizer_path.name), + ) - finally: - try: - if initialized_parallel: - parallel_state.destroy_model_parallel() - except Exception as e: - profiler.track_error("cleanup", str(e)) - print(f"Cleanup error: {e}") - except Exception as e: - profiler.track_error("runtime", str(e)) - print(f"Error: {e}", file=sys.stderr) - raise - finally: - profiler.flush() - print(captured_output.getvalue(), file=sys.stderr) +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True) + parser.add_argument("--size", choices=["7B", "13B"], default="7B") + parser.add_argument("--input", default="This is an example prompt.") + parser.add_argument("--tokenizer", help="Optional explicit tokenizer path") + args = parser.parse_args() + + model_config = { + "7B": {"num_layers": 32, "hidden_size": 4096, "num_attention_heads": 32}, + "13B": {"num_layers": 40, "hidden_size": 5120, "num_attention_heads": 40}, + }[args.size] + + profiler = Profiler(gpu=True) + + try: + model_path = Path(args.model) + if not model_path.exists(): + raise FileNotFoundError(f"Model not found at {model_path}") + + tokenizer = load_tokenizer(args) + profiler.on_stage("tokenizer_loaded") + + te.initialize() + + has_gpu = torch.cuda.is_available() + device = torch.device("cuda" if has_gpu else "cpu") + + # Megatron's tensor parallel + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=1, # No tensor parallelism for now + pipeline_model_parallel_size=1, # No pipeline parallelism + ) + profiler.on_stage("megatron_initialized") + + # Megatron transformer config + config = TransformerConfig( + num_layers=model_config["num_layers"], + hidden_size=model_config["hidden_size"], + num_attention_heads=model_config["num_attention_heads"], + max_position_embeddings=4096, + init_method_std=0.02, + use_scaled_init_method=True, + attention_softmax_in_fp32=True, + rotary_pct=0.25, # LLaMA uses rotary embeddings + ) + + model = GPTModel( # noqa: F821 + config=config, + vocab_size=tokenizer.vocab_size, + max_sequence_length=4096, + parallel_output=False, + share_embeddings_and_output_weights=True, + ) + if has_gpu: + model = model.cuda() + + profiler.on_stage("model_created") + + # Load DMC checkpoint directly to GPU + checkpoint = torch.load(str(model_path), map_location=device) + model.load_state_dict(checkpoint) + model.eval() + profiler.on_stage("model_loaded") + + # Run inference + input_ids = tokenizer(args.input, return_tensors="pt").to(device) + with torch.no_grad(): + output = model(input_ids=input_ids["input_ids"]) + logits = output.logits + next_token = torch.argmax(logits[:, -1, :], dim=-1) + generated = torch.cat([input_ids["input_ids"], next_token.unsqueeze(-1)], dim=-1) + text = tokenizer.decode(generated[0], skip_special_tokens=True) + profiler.track("output", text) + profiler.on_stage("inference_complete") + + except Exception as e: + profiler.track_error("megatron", str(e)) diff --git a/dyana/loaders/megatron/settings.yml b/dyana/loaders/megatron/settings.yml index 59dfe7f..608ae21 100644 --- a/dyana/loaders/megatron/settings.yml +++ b/dyana/loaders/megatron/settings.yml @@ -11,8 +11,7 @@ args: - name: size description: Model size (7B or 13B) - required: true - choices: ["7B", "13B"] + required: false - name: input description: Input text for inference