From 7e14f40bfb6711ada2c1adb2a2003b571d7382bf Mon Sep 17 00:00:00 2001
From: Somasundaram <somasundaram.sindhu@gmail.com>
Date: Mon, 27 Jan 2025 16:50:47 -0800
Subject: [PATCH] [python] add aot config for nxdi with vllm

---
 .../setup/djl_python/transformers_neuronx.py     |  9 +++++++++
 tests/integration/llm/client.py                  |  4 ++++
 tests/integration/llm/prepare.py                 | 16 ++++++++++++++++
 tests/integration/tests.py                       | 16 ++++++++++++++++
 4 files changed, 45 insertions(+)

diff --git a/engines/python/setup/djl_python/transformers_neuronx.py b/engines/python/setup/djl_python/transformers_neuronx.py
index 1c7febac0..83e83c258 100644
--- a/engines/python/setup/djl_python/transformers_neuronx.py
+++ b/engines/python/setup/djl_python/transformers_neuronx.py
@@ -36,6 +36,7 @@
 OPTIMUM_CAUSALLM_MODEL_TYPES = {"gpt2", "opt", "bloom", "llama", "mistral"}
 OPTIMUM_CAUSALLM_CONTINUOUS_BATCHING_MODELS = {"llama", "mistral"}
 VLLM_CONTINUOUS_BATCHING_MODELS = {"llama"}
+NXDI_COMPILED_MODEL_FILE_NAME = "model.pt"  # nxdi: file probed in model dir
 
 
 class TransformersNeuronXService(object):
@@ -141,6 +142,14 @@ def set_model_loader_class(self) -> None:
         if self.config.model_loader == "nxdi":
             os.environ[
                 'VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
+            if self.config.save_mp_checkpoint_path:
+                os.environ[
+                    "NEURON_COMPILED_ARTIFACTS"] = self.config.save_mp_checkpoint_path
+            nxdi_compiled_model_path = os.path.join(
+                self.config.model_id_or_path, NXDI_COMPILED_MODEL_FILE_NAME)
+            if os.path.isfile(nxdi_compiled_model_path):
+                os.environ[
+                    "NEURON_COMPILED_ARTIFACTS"] = self.config.model_id_or_path
             return
 
         if self.config.model_loader == "vllm":
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 821f55d12..2728f5325 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -172,6 +172,10 @@ def get_model_name():
     "llama-3-1-8b-instruct-vllm-nxdi": {
         "batch_size": [1, 2],
         "seq_length": [256],
+    },
+    "llama-3-2-1b-instruct-vllm-nxdi-aot": {
+        "batch_size": [1],
+        "seq_length": [128],
     }
 }
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 1142a1b7f..14e392001 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -271,6 +271,22 @@
                 "deterministic": False
             }
         }
+    },
+    "llama-3-2-1b-instruct-vllm-nxdi-aot": {
+        "option.model_id": "s3://djl-llm/llama-3-2-1b-instruct/",
+        "option.tensor_parallel_degree": 2,
+        "option.rolling_batch": "vllm",
+        "option.model_loading_timeout": 1200,
+        "option.model_loader": "nxdi",
+        "option.override_neuron_config": {
+            "on_device_sampling_config": {
+                "global_topk": 64,
+                "dynamic": True,
+                "deterministic": False
+            }
+        },
+        "option.n_positions": 128,
+        "option.max_rolling_batch_size": 1,
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index d25e3b46b..e52f18c21 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -900,6 +900,22 @@ def test_llama_vllm_nxdi(self):
                 "transformers_neuronx_rolling_batch llama-3-1-8b-instruct-vllm-nxdi"
             )
 
+    def test_llama_vllm_nxdi_aot(self):  # two launches: partition, then serve
+        with Runner('pytorch-inf2',
+                    'llama-3-2-1b-instruct-vllm-nxdi-aot') as r:
+            prepare.build_transformers_neuronx_handler_model(
+                "llama-3-2-1b-instruct-vllm-nxdi-aot")
+            r.launch(  # launch 1: partition with --save-mp-checkpoint-path
+                container="pytorch-inf2-1",
+                cmd=
+                "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot --skip-copy"
+            )
+            r.launch(container="pytorch-inf2-1",  # launch 2: serve .../test/aot
+                     cmd="serve -m test=file:/opt/ml/model/test/aot")  # NOTE(review): presumably the partition output dir - confirm mounts
+            client.run(
+                "transformers_neuronx_rolling_batch llama-3-2-1b-instruct-vllm-nxdi-aot"
+            )
+
 
 @pytest.mark.correctness
 @pytest.mark.trtllm