
GenAI-Perf goodput support #772

Closed
wants to merge 9 commits
104 changes: 104 additions & 0 deletions src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md
@@ -0,0 +1,104 @@
<!--
Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->

# Tutorials

## Profile GPT2 running on Triton + vLLM <a id="triton-vllm"></a>

### Run GPT2 on Triton Inference Server using vLLM

<details>
<summary>See instructions</summary>

Run Triton Inference Server with the vLLM backend container:

```bash
export RELEASE="yy.mm" # e.g. export RELEASE="24.06"


docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3

# Install Triton CLI (~5 min):
pip install "git+https://github.com/triton-inference-server/[email protected]"

# Download model:
triton import -m gpt2 --backend vllm

# Run server:
triton start
```

</details>

### Run GenAI-Perf

Run GenAI-Perf from the Triton Inference Server SDK container:

```bash
export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

# Run GenAI-Perf in the container:
genai-perf profile \
-m gpt2 \
--service-kind triton \
--backend vllm \
--num-prompts 100 \
--random-seed 123 \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--streaming \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--output-tokens-mean-deterministic \
--tokenizer hf-internal-testing/llama-tokenizer \
--concurrency 4 \
--measurement-interval 800 \
--profile-export-file my_profile_export.json \
--url localhost:8001 \
--goodput ttft:10 itl:2
```

Example output:

```
LLM Metrics
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
│ Time to first token (ms) │ 9.30 │ 5.78 │ 20.62 │ 20.61 │ 18.51 │ 10.00 │
│ Inter token latency (ms) │ 2.22 │ 1.74 │ 3.01 │ 2.96 │ 2.78 │ 2.28 │
│ Request latency (ms) │ 256.65 │ 216.91 │ 345.47 │ 345.46 │ 303.17 │ 267.83 │
│ Output sequence length │ 112.41 │ 106.00 │ 125.00 │ 123.71 │ 117.70 │ 115.00 │
│ Input sequence length │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │
└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
Output token throughput (per sec): 1751.15
Request goodput (per sec): 2.83
Request throughput (per sec): 15.58
```
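The goodput figure in the table above can be reproduced by hand: a request counts toward goodput only when its time to first token (TTFT) and inter-token latency (ITL) both meet the `--goodput` constraints, and the count is divided by the benchmark duration. The sketch below is illustrative only; the latency values and the 1-second window are made up, not taken from the run above.

```python
def request_goodput(ttfts_ms, itls_ms, ttft_limit_ms, itl_limit_ms, duration_s):
    """Count requests whose TTFT and ITL both meet their limits,
    then normalize by the benchmark duration (requests/sec)."""
    good = sum(
        1
        for ttft, itl in zip(ttfts_ms, itls_ms)
        if ttft <= ttft_limit_ms and itl <= itl_limit_ms
    )
    return good / duration_s

# Four requests over a 1-second window; two meet both ttft:10 and itl:2.
print(request_goodput([9.3, 20.6, 5.8, 8.1], [1.2, 1.9, 1.7, 3.0], 10, 2, 1.0))
# 2.0
```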
@@ -66,6 +66,8 @@ def export(self) -> None:
# System metrics are printed after the table
for metric in self._metrics.system_metrics:
line = metric.name.replace("_", " ").capitalize()
if metric.name == "request_goodput" and not self._args.goodput:
continue
value = self._stats[metric.name]["avg"]
line += f" ({metric.unit}): {value:.2f}"
print(line)
@@ -105,3 +107,4 @@ def _should_skip(self, metric_name: str) -> bool:
if not self._args.streaming and metric_name in streaming_metrics:
return True
return False

@@ -94,6 +94,8 @@ def _write_system_metrics(self, csv_writer) -> None:
for metric in self._metrics.system_metrics:
metric_str = metric.name.replace("_", " ").title()
metric_str += f" ({metric.unit})"
if metric.name == "request_goodput" and not self._args.goodput:
continue
value = self._stats[metric.name]["avg"]
csv_writer.writerow([metric_str, f"{value:.2f}"])

@@ -75,3 +75,4 @@ def artifact_dir(self):
@artifact_dir.setter
def artifact_dir(self, artifact_dir_value):
self._artifact_dir = artifact_dir_value

1 change: 1 addition & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/main.py
@@ -99,6 +99,7 @@ def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> ProfileDataParse
return LLMProfileDataParser(
filename=args.profile_export_file,
tokenizer=tokenizer,
goodput_constraints=args.goodput,
)


@@ -45,6 +45,7 @@ class LLMMetrics(Metrics):
LLM_SYSTEM_METRICS = [
# (TMA-1977) Make the unit consistent with statistics dict (e.g. tokens/sec)
MetricMetadata("output_token_throughput", "per sec"),
MetricMetadata("request_goodput", "per sec"),
]

def __init__(
@@ -58,6 +59,7 @@ def __init__(
output_sequence_lengths: List[int] = [],
input_sequence_lengths: List[int] = [],
chunked_inter_token_latencies: List[List[int]] = [[]],
request_goodputs: List[float] = [],
) -> None:
super().__init__(request_throughputs, request_latencies)
self.time_to_first_tokens = time_to_first_tokens
@@ -66,6 +68,7 @@ def __init__(
self.output_token_throughputs_per_request = output_token_throughputs_per_request
self.output_sequence_lengths = output_sequence_lengths
self.input_sequence_lengths = input_sequence_lengths
self.request_goodputs = request_goodputs

# Keeping chunked ITL (old) as a WAR to preserve visualization.
# Excluded from data.
@@ -80,6 +83,7 @@ def __init__(
)
self._base_names["output_sequence_lengths"] = "output_sequence_length"
self._base_names["input_sequence_lengths"] = "input_sequence_length"
self._base_names["request_goodputs"] = "request_goodput"

@property
def request_metrics(self) -> List[MetricMetadata]:
@@ -105,4 +109,6 @@ def system_metrics(self) -> List[MetricMetadata]:
# base metrics first and then task specific metrics. Uncomment the below
# line to enable this order:
# return base_metrics + self.LLM_SYSTEM_METRICS
# Right now, goodput is printed before throughput whenever goodput
# is enabled.
return self.LLM_SYSTEM_METRICS + base_metrics
@@ -131,6 +131,8 @@ def _add_units(self, key) -> None:
self._stats_dict[key]["unit"] = "ms"
elif key == "request_throughput":
self._stats_dict[key]["unit"] = "requests/sec"
elif key == "request_goodput":
self._stats_dict[key]["unit"] = "requests/sec"
elif key.startswith("output_token_throughput"):
self._stats_dict[key]["unit"] = "tokens/sec"
elif "sequence_length" in key:
43 changes: 43 additions & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -245,6 +245,24 @@ def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace:
args.concurrency = 1
return args

def _check_goodput_args(args: argparse.Namespace) -> argparse.Namespace:
"""
Parse and check goodput args
"""
if args.goodput:
args.goodput = parse_goodput(args.goodput)
if 'ttft' not in args.goodput and 'itl' not in args.goodput:
raise argparse.ArgumentTypeError(
f"Invalid goodput constraints format: {args.goodput}. "
"Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
)
if 'ttft' not in args.goodput:
args.goodput['ttft'] = 1e9
if 'itl' not in args.goodput:
args.goodput['itl'] = 1e9
if args.goodput['ttft'] < 0 or args.goodput['itl'] < 0:
raise ValueError("Goodput constraint values must be non-negative.")
return args

def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
"""
@@ -286,6 +304,18 @@ def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
args.profile_export_file = args.artifact_dir / args.profile_export_file
return args

def parse_goodput(values):
constraints = {}
try:
for item in values:
target_metric, target_val = item.split(':')
constraints[target_metric] = float(target_val)
except ValueError:
raise argparse.ArgumentTypeError(
f"Invalid goodput constraints format: {values}. "
"Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
)
return constraints
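For reference, the parsing helper above can be exercised on its own. This is a self-contained copy of `parse_goodput` as added in this PR, with a usage example: each `name:value` token becomes a `{name: float(value)}` entry, and a malformed token raises an argparse error.

```python
import argparse


def parse_goodput(values):
    """Parse goodput constraints of the form 'ttft:x itl:y' into a dict."""
    constraints = {}
    try:
        for item in values:
            target_metric, target_val = item.split(":")
            constraints[target_metric] = float(target_val)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"Invalid goodput constraints format: {values}. "
            "Expected format is 'ttft:x itl:y', where x and y are "
            "numbers in milliseconds."
        )
    return constraints


print(parse_goodput(["ttft:10", "itl:2"]))  # {'ttft': 10.0, 'itl': 2.0}
```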

def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace:
if args.input_dataset:
@@ -651,6 +681,17 @@ def _add_other_args(parser):
help="An option to enable verbose mode.",
)

def _add_goodput_args(parser):
goodput_group = parser.add_argument_group("Goodput")

goodput_group.add_argument(
"--goodput",
"-g",
nargs='+',
required=False,
help="The goodput constraints are in the format of 'ttft:x itl:y', "
"where x and y are numbers in milliseconds."
)

def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict:
request_inputs = {}
@@ -733,6 +774,7 @@ def _parse_profile_args(subparsers) -> argparse.ArgumentParser:
_add_profile_args(profile)
_add_output_args(profile)
_add_other_args(profile)
_add_goodput_args(profile)
profile.set_defaults(func=profile_handler)
return profile

@@ -812,6 +854,7 @@ def refine_args(
args = _check_image_input_args(parser, args)
args = _check_load_manager_args(args)
args = _set_artifact_paths(args)
args = _check_goodput_args(args)
elif args.subcommand == Subcommand.COMPARE.to_lowercase():
args = _check_compare_args(parser, args)
else:
@@ -69,8 +69,10 @@ def __init__(
self,
filename: Path,
tokenizer: Tokenizer,
goodput_constraints: Dict[str, float] = {},
) -> None:
self._tokenizer = tokenizer
self._goodput_constraints = goodput_constraints
super().__init__(filename)

def _parse_requests(self, requests: dict) -> Metrics:
Expand Down Expand Up @@ -145,10 +147,18 @@ def _parse_requests(self, requests: dict) -> Metrics:
chunked_inter_token_latencies.append(chunked_inter_token_latency)

# request & output token throughput
benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec
benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # to seconds
request_throughputs = [len(requests) / benchmark_duration]
output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration]

# request goodput
request_goodputs = []
if self._goodput_constraints:
request_good_count = self._count_good_req(
time_to_first_tokens, inter_token_latencies
)
request_goodputs = [request_good_count / benchmark_duration]

return LLMMetrics(
request_throughputs,
request_latencies,
@@ -159,8 +169,21 @@ def _parse_requests(self, requests: dict) -> Metrics:
output_sequence_lengths,
input_sequence_lengths,
chunked_inter_token_latencies,
request_goodputs,
)


def _count_good_req(self, time_to_first_tokens, inter_token_latencies):
ttft_constraint_ms = self._goodput_constraints['ttft']
itl_constraint_ms = self._goodput_constraints['itl']
# ms to ns
ttft_constraint = ttft_constraint_ms * 1e6
itl_constraint = itl_constraint_ms * 1e6
good_req_count = 0
for ttft, itl in zip(time_to_first_tokens, inter_token_latencies):
if ttft <= ttft_constraint and itl <= itl_constraint:
good_req_count += 1
return good_req_count
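A note on units: profile timestamps are recorded in nanoseconds, while `--goodput` values are given in milliseconds, hence the `* 1e6` scaling in `_count_good_req`. A minimal check of that conversion (the latency values below are illustrative, not from a real profile):

```python
# --goodput ttft:10 means a 10 ms TTFT budget; scale it to nanoseconds
# before comparing against nanosecond-resolution timestamps.
ttft_constraint_ms = 10.0
ttft_constraint_ns = ttft_constraint_ms * 1e6  # 10 ms == 10,000,000 ns

ttfts_ns = [9_300_000, 20_620_000, 5_780_000]  # 9.30 ms, 20.62 ms, 5.78 ms
good = [t for t in ttfts_ns if t <= ttft_constraint_ns]
print(len(good))  # 2
```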

def _pairwise(self, iterable):
"""Generate pairs of consecutive elements from the given iterable."""
a, b = tee(iterable)
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -98,6 +98,7 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
"image_height_mean",
"image_height_stddev",
"image_format",
"goodput",
]

utils.remove_file(args.profile_export_file)
@@ -263,6 +263,7 @@ def test_generate_json(self, monkeypatch) -> None:
"artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1",
"tokenizer": "hf-internal-testing/llama-tokenizer",
"verbose": false,
"goodput": null,
"subcommand": "profile",
"prompt_source": "synthetic",
"extra_inputs": {
7 changes: 5 additions & 2 deletions src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -71,11 +71,14 @@ def test_llm_metric_system_metrics(self) -> None:
)

sys_metrics = m.system_metrics
assert len(sys_metrics) == 2
assert len(sys_metrics) == 3
assert sys_metrics[0].name == "output_token_throughput"
assert sys_metrics[0].unit == "per sec"
assert sys_metrics[1].name == "request_throughput"
assert sys_metrics[1].name == "request_goodput"
assert sys_metrics[1].unit == "per sec"
assert sys_metrics[2].name == "request_throughput"
assert sys_metrics[2].unit == "per sec"


def test_llm_metrics_get_base_name(self) -> None:
"""Test get_base_name method in LLMMetrics class."""