From 7e14f40bfb6711ada2c1adb2a2003b571d7382bf Mon Sep 17 00:00:00 2001
From: Somasundaram
Date: Mon, 27 Jan 2025 16:50:47 -0800
Subject: [PATCH] [python] add aot config for nxdi with vllm

---
Notes (not part of the commit message):
  * Adds the module constant NXDI_COMPILED_MODEL_FILE_NAME ("model.pt").
  * In the "nxdi" branch of set_model_loader_class, NEURON_COMPILED_ARTIFACTS
    is set to save_mp_checkpoint_path when that option is set, and to
    model_id_or_path when that directory contains model.pt. The second
    assignment runs last, so it takes precedence when both conditions hold.
  * Adds the "llama-3-2-1b-instruct-vllm-nxdi-aot" client spec and handler
    config, plus an integration test that runs "partition" and then "serve".
  * Line breaks and indentation in this patch were restored from a
    whitespace-collapsed copy; verify context lines against the target tree
    (or apply with `git apply --ignore-whitespace`) before relying on it.

 .../setup/djl_python/transformers_neuronx.py     |  9 +++++++++
 tests/integration/llm/client.py                  |  4 ++++
 tests/integration/llm/prepare.py                 | 16 ++++++++++++++++
 tests/integration/tests.py                       | 16 ++++++++++++++++
 4 files changed, 45 insertions(+)

diff --git a/engines/python/setup/djl_python/transformers_neuronx.py b/engines/python/setup/djl_python/transformers_neuronx.py
index 1c7febac0..83e83c258 100644
--- a/engines/python/setup/djl_python/transformers_neuronx.py
+++ b/engines/python/setup/djl_python/transformers_neuronx.py
@@ -36,6 +36,7 @@
 OPTIMUM_CAUSALLM_MODEL_TYPES = {"gpt2", "opt", "bloom", "llama", "mistral"}
 OPTIMUM_CAUSALLM_CONTINUOUS_BATCHING_MODELS = {"llama", "mistral"}
 VLLM_CONTINUOUS_BATCHING_MODELS = {"llama"}
+NXDI_COMPILED_MODEL_FILE_NAME = "model.pt"
 
 
 class TransformersNeuronXService(object):
@@ -141,6 +142,14 @@ def set_model_loader_class(self) -> None:
         if self.config.model_loader == "nxdi":
             os.environ[
                 'VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
+            if self.config.save_mp_checkpoint_path:
+                os.environ[
+                    "NEURON_COMPILED_ARTIFACTS"] = self.config.save_mp_checkpoint_path
+            nxdi_compiled_model_path = os.path.join(
+                self.config.model_id_or_path, NXDI_COMPILED_MODEL_FILE_NAME)
+            if os.path.isfile(nxdi_compiled_model_path):
+                os.environ[
+                    "NEURON_COMPILED_ARTIFACTS"] = self.config.model_id_or_path
             return
 
         if self.config.model_loader == "vllm":
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 821f55d12..2728f5325 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -172,6 +172,10 @@ def get_model_name():
     "llama-3-1-8b-instruct-vllm-nxdi": {
         "batch_size": [1, 2],
         "seq_length": [256],
+    },
+    "llama-3-2-1b-instruct-vllm-nxdi-aot": {
+        "batch_size": [1],
+        "seq_length": [128],
     }
 }
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 1142a1b7f..14e392001 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -271,6 +271,22 @@
                 "deterministic": False
             }
         }
+    },
+    "llama-3-2-1b-instruct-vllm-nxdi-aot": {
+        "option.model_id": "s3://djl-llm/llama-3-2-1b-instruct/",
+        "option.tensor_parallel_degree": 2,
+        "option.rolling_batch": "vllm",
+        "option.model_loading_timeout": 1200,
+        "option.model_loader": "nxdi",
+        "option.override_neuron_config": {
+            "on_device_sampling_config": {
+                "global_topk": 64,
+                "dynamic": True,
+                "deterministic": False
+            }
+        },
+        "option.n_positions": 128,
+        "option.max_rolling_batch_size": 1,
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index d25e3b46b..e52f18c21 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -900,6 +900,22 @@ def test_llama_vllm_nxdi(self):
                 "transformers_neuronx_rolling_batch llama-3-1-8b-instruct-vllm-nxdi"
             )
 
+    def test_llama_vllm_nxdi_aot(self):
+        with Runner('pytorch-inf2',
+                    'llama-3-2-1b-instruct-vllm-nxdi-aot') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "llama-3-2-1b-instruct-vllm-nxdi-aot")
+            r.launch(
+                container="pytorch-inf2-1",
+                cmd=
+                "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot --skip-copy"
+            )
+            r.launch(container="pytorch-inf2-1",
+                     cmd="serve -m test=file:/opt/ml/model/test/aot")
+            client.run(
+                "transformers_neuronx_rolling_batch llama-3-2-1b-instruct-vllm-nxdi-aot"
+            )
+
 
 @pytest.mark.correctness
 @pytest.mark.trtllm