diff --git a/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md b/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md
new file mode 100644
index 000000000..4b1e56a80
--- /dev/null
+++ b/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md
@@ -0,0 +1,104 @@
+# Goodput Tutorial
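+
+This tutorial shows how to use the `--goodput` option, which reports request
+goodput: the number of completed requests per second that satisfy
+user-specified latency constraints, such as time to first token (TTFT) and
+inter-token latency (ITL).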
+
+## Profile GPT2 running on Triton + vLLM
+
+### Run GPT2 on Triton Inference Server using vLLM
+
+Run Triton Inference Server with vLLM backend container:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3
+
+# Install Triton CLI (~5 min):
+pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8"
+
+# Download model:
+triton import -m gpt2 --backend vllm
+
+# Run server:
+triton start
+```
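+
+Once the server is running, it accepts gRPC inference requests on port 8001
+(Triton's default gRPC port), which is the endpoint the GenAI-Perf command
+below targets with `--url localhost:8001`.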
+
+### Run GenAI-Perf
+
+Run GenAI-Perf from Triton Inference Server SDK container:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
+
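+# The SDK container ships with GenAI-Perf preinstalled.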
+# Run GenAI-Perf in the container:
+genai-perf profile \
+ -m gpt2 \
+ --service-kind triton \
+ --backend vllm \
+ --num-prompts 100 \
+ --random-seed 123 \
+ --synthetic-input-tokens-mean 200 \
+ --synthetic-input-tokens-stddev 0 \
+ --streaming \
+ --output-tokens-mean 100 \
+ --output-tokens-stddev 0 \
+ --output-tokens-mean-deterministic \
+ --tokenizer hf-internal-testing/llama-tokenizer \
+ --concurrency 4 \
+ --measurement-interval 800 \
+ --profile-export-file my_profile_export.json \
+ --url localhost:8001 \
+ --goodput ttft:10 itl:2
+```
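+
+The `--goodput ttft:10 itl:2` option defines which requests count as "good":
+only requests whose time to first token (TTFT) is at most 10 ms and whose
+inter-token latency (ITL) is at most 2 ms contribute to request goodput. Any
+constraint you omit is treated as unbounded.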
+
+Example output:
+
+```
+ LLM Metrics
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+│ Time to first token (ms) │ 9.30 │ 5.78 │ 20.62 │ 20.61 │ 18.51 │ 10.00 │
+│ Inter token latency (ms) │ 2.22 │ 1.74 │ 3.01 │ 2.96 │ 2.78 │ 2.28 │
+│ Request latency (ms) │ 256.65 │ 216.91 │ 345.47 │ 345.46 │ 303.17 │ 267.83 │
+│ Output sequence length │ 112.41 │ 106.00 │ 125.00 │ 123.71 │ 117.70 │ 115.00 │
+│ Input sequence length │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │
+└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
+Output token throughput (per sec): 1751.15
+Request goodput (per sec): 2.83
+Request throughput (per sec): 15.58
+```
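+
+In this run, request goodput (2.83 requests/sec) is well below request
+throughput (15.58 requests/sec): only the requests that met both latency
+constraints, roughly 18% of them, count as good.
+
+For illustration, below is a minimal sketch of how request goodput is
+computed (the standalone function name and signature are hypothetical;
+GenAI-Perf computes this internally from the measured TTFT and ITL values):
+
+```python
+from typing import Dict, List
+
+
+def request_goodput_per_sec(
+    ttfts_ns: List[float],
+    itls_ns: List[float],
+    constraints_ms: Dict[str, float],
+    benchmark_duration_s: float,
+) -> float:
+    """Count requests meeting all latency constraints, per second."""
+    ttft_limit_ns = constraints_ms["ttft"] * 1e6  # ms -> ns
+    itl_limit_ns = constraints_ms["itl"] * 1e6  # ms -> ns
+    good_count = sum(
+        1
+        for ttft, itl in zip(ttfts_ns, itls_ns)
+        if ttft <= ttft_limit_ns and itl <= itl_limit_ns
+    )
+    return good_count / benchmark_duration_s
+```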
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
index 460fe5976..0e84302a0 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
@@ -66,6 +66,8 @@ def export(self) -> None:
# System metrics are printed after the table
for metric in self._metrics.system_metrics:
+            if metric.name == "request_goodput" and not self._args.goodput:
+                continue
             line = metric.name.replace("_", " ").capitalize()
value = self._stats[metric.name]["avg"]
line += f" ({metric.unit}): {value:.2f}"
print(line)
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
index efbb9b754..30c0385b6 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
@@ -94,6 +94,8 @@ def _write_system_metrics(self, csv_writer) -> None:
for metric in self._metrics.system_metrics:
+            if metric.name == "request_goodput" and not self._args.goodput:
+                continue
             metric_str = metric.name.replace("_", " ").title()
             metric_str += f" ({metric.unit})"
value = self._stats[metric.name]["avg"]
csv_writer.writerow([metric_str, f"{value:.2f}"])
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
index 9ff7b5b9a..b7d7d1b13 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
@@ -99,6 +99,7 @@ def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> ProfileDataParse
return LLMProfileDataParser(
filename=args.profile_export_file,
tokenizer=tokenizer,
+ goodput_constraints=args.goodput,
)
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
index 13dff8a63..7951fb0db 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
@@ -45,6 +45,7 @@ class LLMMetrics(Metrics):
LLM_SYSTEM_METRICS = [
# (TMA-1977) Make the unit consistent with statistics dict (e.g. tokens/sec)
MetricMetadata("output_token_throughput", "per sec"),
+        MetricMetadata("request_goodput", "per sec"),
]
def __init__(
@@ -58,6 +59,7 @@ def __init__(
output_sequence_lengths: List[int] = [],
input_sequence_lengths: List[int] = [],
chunked_inter_token_latencies: List[List[int]] = [[]],
+ request_goodputs: List[float] = [],
) -> None:
super().__init__(request_throughputs, request_latencies)
self.time_to_first_tokens = time_to_first_tokens
@@ -66,6 +68,7 @@ def __init__(
self.output_token_throughputs_per_request = output_token_throughputs_per_request
self.output_sequence_lengths = output_sequence_lengths
self.input_sequence_lengths = input_sequence_lengths
+ self.request_goodputs = request_goodputs
# Keeping chunked ITL (old) as a WAR to preserve visualization.
# Excluded from data.
@@ -80,6 +83,7 @@ def __init__(
)
self._base_names["output_sequence_lengths"] = "output_sequence_length"
self._base_names["input_sequence_lengths"] = "input_sequence_length"
+ self._base_names["request_goodputs"] = "request_goodput"
@property
def request_metrics(self) -> List[MetricMetadata]:
@@ -105,4 +109,6 @@ def system_metrics(self) -> List[MetricMetadata]:
# base metrics first and then task specific metrics. Uncomment the below
# line to enable this order:
# return base_metrics + self.LLM_SYSTEM_METRICS
+        # For now, request goodput is printed before request throughput
+        # whenever goodput is enabled.
return self.LLM_SYSTEM_METRICS + base_metrics
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
index f0d12cef6..8ba37a8ad 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
@@ -131,6 +131,8 @@ def _add_units(self, key) -> None:
self._stats_dict[key]["unit"] = "ms"
elif key == "request_throughput":
self._stats_dict[key]["unit"] = "requests/sec"
+ elif key == "request_goodput":
+ self._stats_dict[key]["unit"] = "requests/sec"
elif key.startswith("output_token_throughput"):
self._stats_dict[key]["unit"] = "tokens/sec"
elif "sequence_length" in key:
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
index 776535d15..beec0bea6 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -245,6 +245,24 @@ def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace:
args.concurrency = 1
return args
+def _check_goodput_args(args: argparse.Namespace) -> argparse.Namespace:
+    """
+    Parse and validate the goodput constraint args.
+    """
+    if args.goodput:
+        args.goodput = parse_goodput(args.goodput)
+        if not set(args.goodput).issubset({"ttft", "itl"}):
+            raise argparse.ArgumentTypeError(
+                f"Invalid goodput constraints format: {args.goodput}. "
+                "Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
+            )
+        # An unspecified constraint defaults to 1e9 ms, which is effectively unbounded.
+        if "ttft" not in args.goodput:
+            args.goodput["ttft"] = 1e9
+        if "itl" not in args.goodput:
+            args.goodput["itl"] = 1e9
+        if args.goodput["ttft"] < 0 or args.goodput["itl"] < 0:
+            raise argparse.ArgumentTypeError(
+                "Goodput constraint values must be non-negative."
+            )
+    return args
def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
"""
@@ -286,6 +304,18 @@ def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
args.profile_export_file = args.artifact_dir / args.profile_export_file
return args
+def parse_goodput(values):
+    """Parse 'metric:value' pairs into a dict of goodput constraints."""
+    constraints = {}
+    try:
+        for item in values:
+            target_metric, target_val = item.split(":")
+            constraints[target_metric] = float(target_val)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            f"Invalid goodput constraints format: {values}. "
+            "Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
+        )
+    return constraints
def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace:
if args.input_dataset:
@@ -651,6 +681,17 @@ def _add_other_args(parser):
help="An option to enable verbose mode.",
)
+def _add_goodput_args(parser):
+    goodput_group = parser.add_argument_group("Goodput")
+
+    goodput_group.add_argument(
+        "--goodput",
+        "-g",
+        nargs="+",
+        required=False,
+        help="Space-separated goodput constraints in the format 'ttft:x itl:y', "
+        "where x and y are latency limits in milliseconds. Requests that meet "
+        "every constraint count toward request goodput.",
+    )
def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict:
request_inputs = {}
@@ -733,6 +774,7 @@ def _parse_profile_args(subparsers) -> argparse.ArgumentParser:
_add_profile_args(profile)
_add_output_args(profile)
_add_other_args(profile)
+ _add_goodput_args(profile)
profile.set_defaults(func=profile_handler)
return profile
@@ -812,6 +854,7 @@ def refine_args(
args = _check_image_input_args(parser, args)
args = _check_load_manager_args(args)
args = _set_artifact_paths(args)
+ args = _check_goodput_args(args)
elif args.subcommand == Subcommand.COMPARE.to_lowercase():
args = _check_compare_args(parser, args)
else:
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
index 183f21fd2..8d59e0c77 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
@@ -69,8 +69,10 @@ def __init__(
self,
filename: Path,
tokenizer: Tokenizer,
+ goodput_constraints: Dict[str, float] = {},
) -> None:
self._tokenizer = tokenizer
+ self._goodput_constraints = goodput_constraints
super().__init__(filename)
def _parse_requests(self, requests: dict) -> Metrics:
@@ -145,10 +147,18 @@ def _parse_requests(self, requests: dict) -> Metrics:
chunked_inter_token_latencies.append(chunked_inter_token_latency)
# request & output token throughput
- benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec
+ benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # to seconds
request_throughputs = [len(requests) / benchmark_duration]
output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration]
+        # request goodput (computed only when constraints are provided)
+ request_goodputs = []
+ if self._goodput_constraints:
+ request_good_count = self._count_good_req(
+ time_to_first_tokens, inter_token_latencies
+ )
+ request_goodputs = [request_good_count / benchmark_duration]
+
return LLMMetrics(
request_throughputs,
request_latencies,
@@ -159,8 +169,21 @@ def _parse_requests(self, requests: dict) -> Metrics:
output_sequence_lengths,
input_sequence_lengths,
chunked_inter_token_latencies,
+ request_goodputs,
)
-
+
+    def _count_good_req(self, time_to_first_tokens, inter_token_latencies):
+        """Count the requests that satisfy all goodput constraints."""
+        ttft_constraint_ms = self._goodput_constraints["ttft"]
+        itl_constraint_ms = self._goodput_constraints["itl"]
+        # Convert constraints from milliseconds to nanoseconds to match the
+        # units of the parsed latency metrics.
+        ttft_constraint = ttft_constraint_ms * 1e6
+        itl_constraint = itl_constraint_ms * 1e6
+        good_req_count = 0
+        for ttft, itl in zip(time_to_first_tokens, inter_token_latencies):
+            if ttft <= ttft_constraint and itl <= itl_constraint:
+                good_req_count += 1
+        return good_req_count
+
def _pairwise(self, iterable):
"""Generate pairs of consecutive elements from the given iterable."""
a, b = tee(iterable)
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
index 76ef3e321..0ad4f0d3b 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -98,6 +98,7 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
"image_height_mean",
"image_height_stddev",
"image_format",
+ "goodput",
]
utils.remove_file(args.profile_export_file)
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
index f82e59312..5cb8c138c 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
@@ -263,6 +263,7 @@ def test_generate_json(self, monkeypatch) -> None:
"artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1",
"tokenizer": "hf-internal-testing/llama-tokenizer",
"verbose": false,
+ "goodput": null,
"subcommand": "profile",
"prompt_source": "synthetic",
"extra_inputs": {
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
index 689e366cd..8a90f37a8 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -71,11 +71,14 @@ def test_llm_metric_system_metrics(self) -> None:
)
sys_metrics = m.system_metrics
- assert len(sys_metrics) == 2
+ assert len(sys_metrics) == 3
assert sys_metrics[0].name == "output_token_throughput"
assert sys_metrics[0].unit == "per sec"
- assert sys_metrics[1].name == "request_throughput"
+ assert sys_metrics[1].name == "request_goodput"
assert sys_metrics[1].unit == "per sec"
+ assert sys_metrics[2].name == "request_throughput"
+ assert sys_metrics[2].unit == "per sec"
def test_llm_metrics_get_base_name(self) -> None:
"""Test get_base_name method in LLMMetrics class."""