diff --git a/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md b/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md
new file mode 100644
index 000000000..4b1e56a80
--- /dev/null
+++ b/src/c++/perf_analyzer/genai-perf/docs/goodput_tutorial.md
@@ -0,0 +1,104 @@
+
+
+# Tutorials
+
+## Profile GPT2 running on Triton + vLLM
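+
+This tutorial shows how to profile GPT2 and report request goodput, the rate
+of requests that satisfy user-specified latency constraints, here time to
+first token (TTFT) and inter token latency (ITL).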
+
+### Run GPT2 on Triton Inference Server using vLLM
+
+<details>
+<summary>See instructions</summary>
+
+Run the Triton Inference Server container with the vLLM backend:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3
+
+# Install Triton CLI (~5 min):
+pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8"
+
+# Download model:
+triton import -m gpt2 --backend vllm
+
+# Run server:
+triton start
+```
+
+</details>
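+
+Note: Triton serves gRPC inference requests on port 8001 by default, which is
+the endpoint GenAI-Perf targets below via `--url localhost:8001`.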
+
+### Run GenAI-Perf
+
+Run GenAI-Perf from the Triton Inference Server SDK container:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
+
+# Run GenAI-Perf in the container:
+genai-perf profile \
+  -m gpt2 \
+  --service-kind triton \
+  --backend vllm \
+  --num-prompts 100 \
+  --random-seed 123 \
+  --synthetic-input-tokens-mean 200 \
+  --synthetic-input-tokens-stddev 0 \
+  --streaming \
+  --output-tokens-mean 100 \
+  --output-tokens-stddev 0 \
+  --output-tokens-mean-deterministic \
+  --tokenizer hf-internal-testing/llama-tokenizer \
+  --concurrency 4 \
+  --measurement-interval 800 \
+  --profile-export-file my_profile_export.json \
+  --url localhost:8001 \
+  --goodput ttft:10 itl:2
+```
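+
+The `--goodput ttft:10 itl:2` flag asks GenAI-Perf to also report request
+goodput: the number of requests per second whose time to first token is at
+most 10 ms and whose inter token latency is at most 2 ms. In the example
+output below, 2.83 of 15.58 requests per second (about 18%) met both
+constraints. A minimal sketch of the counting logic, using hypothetical
+values (GenAI-Perf collects these measurements per request internally):
+
+```python
+ttfts_ms = [9.3, 5.8, 20.6]  # hypothetical per-request TTFTs (ms)
+itls_ms = [2.2, 1.7, 3.0]    # hypothetical per-request mean ITLs (ms)
+benchmark_duration_s = 0.5   # hypothetical wall-clock duration (s)
+
+# A request counts as "good" only if it meets every constraint.
+good = sum(1 for ttft, itl in zip(ttfts_ms, itls_ms) if ttft <= 10 and itl <= 2)
+
+goodput_per_sec = good / benchmark_duration_s  # here: 1 / 0.5 = 2.0
+```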
+
+Example output:
+
+```
+                                 LLM Metrics
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+┃ Statistic                ┃    avg ┃    min ┃    max ┃    p99 ┃    p90 ┃    p75 ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+│ Time to first token (ms) │   9.30 │   5.78 │  20.62 │  20.61 │  18.51 │  10.00 │
+│ Inter token latency (ms) │   2.22 │   1.74 │   3.01 │   2.96 │   2.78 │   2.28 │
+│ Request latency (ms)     │ 256.65 │ 216.91 │ 345.47 │ 345.46 │ 303.17 │ 267.83 │
+│ Output sequence length   │ 112.41 │ 106.00 │ 125.00 │ 123.71 │ 117.70 │ 115.00 │
+│ Input sequence length    │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │
+└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
+Output token throughput (per sec): 1751.15
+Request goodput (per sec): 2.83
+Request throughput (per sec): 15.58
+```
\ No newline at end of file
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
index 460fe5976..0e84302a0 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/console_exporter.py
@@ -66,6 +66,8 @@ def export(self) -> None:
         # System metrics are printed after the table
         for metric in self._metrics.system_metrics:
+            if metric.name == "request_goodput" and not self._args.goodput:
+                continue
             line = metric.name.replace("_", " ").capitalize()
             value = self._stats[metric.name]["avg"]
             line += f" ({metric.unit}): {value:.2f}"
             print(line)
@@ -105,3 +107,4 @@ def _should_skip(self, metric_name: str) -> bool:
         if not self._args.streaming and metric_name in streaming_metrics:
             return True
         return False
+
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
index efbb9b754..30c0385b6 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/csv_exporter.py
@@ -94,6 +94,8 @@ def _write_system_metrics(self, csv_writer) -> None:
         for metric in self._metrics.system_metrics:
+            if metric.name == "request_goodput" and not self._args.goodput:
+                continue
             metric_str = metric.name.replace("_", " ").title()
             metric_str += f" ({metric.unit})"
             value = self._stats[metric.name]["avg"]
             csv_writer.writerow([metric_str, f"{value:.2f}"])
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py
index 0d9c7cd0b..7ff66b657 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/export_data/exporter_config.py
@@ -75,3 +75,4 @@ def artifact_dir(self):
     @artifact_dir.setter
     def artifact_dir(self, artifact_dir_value):
         self._artifact_dir = artifact_dir_value
+
\ No newline at end of file
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
index 9ff7b5b9a..b7d7d1b13 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py
@@ -99,6 +99,7 @@ def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> ProfileDataParser:
     return LLMProfileDataParser(
         filename=args.profile_export_file,
         tokenizer=tokenizer,
+        goodput_constraints=args.goodput,
     )
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
index 13dff8a63..7951fb0db 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/llm_metrics.py
@@ -45,6 +45,7 @@ class LLMMetrics(Metrics):
     LLM_SYSTEM_METRICS = [
         # (TMA-1977) Make the unit consistent with statistics dict (e.g. tokens/sec)
         MetricMetadata("output_token_throughput", "per sec"),
+        MetricMetadata("request_goodput", "per sec"),
     ]
 
     def __init__(
@@ -58,6 +59,7 @@ def __init__(
         output_sequence_lengths: List[int] = [],
         input_sequence_lengths: List[int] = [],
         chunked_inter_token_latencies: List[List[int]] = [[]],
+        request_goodputs: List[float] = [],
     ) -> None:
         super().__init__(request_throughputs, request_latencies)
         self.time_to_first_tokens = time_to_first_tokens
@@ -66,6 +68,7 @@ def __init__(
         self.output_token_throughputs_per_request = output_token_throughputs_per_request
         self.output_sequence_lengths = output_sequence_lengths
         self.input_sequence_lengths = input_sequence_lengths
+        self.request_goodputs = request_goodputs
 
         # Keeping chunked ITL (old) as a WAR to preserve visualization.
         # Excluded from data.
@@ -80,6 +83,7 @@ def __init__(
         )
         self._base_names["output_sequence_lengths"] = "output_sequence_length"
         self._base_names["input_sequence_lengths"] = "input_sequence_length"
+        self._base_names["request_goodputs"] = "request_goodput"
 
     @property
     def request_metrics(self) -> List[MetricMetadata]:
@@ -105,4 +109,6 @@ def system_metrics(self) -> List[MetricMetadata]:
         # base metrics first and then task specific metrics. Uncomment the below
        # line to enable this order:
         # return base_metrics + self.LLM_SYSTEM_METRICS
+        # Right now, request goodput (when enabled) is printed before request
+        # throughput because LLM_SYSTEM_METRICS comes first.
         return self.LLM_SYSTEM_METRICS + base_metrics
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
index f0d12cef6..8ba37a8ad 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/metrics/statistics.py
@@ -131,6 +131,8 @@ def _add_units(self, key) -> None:
             self._stats_dict[key]["unit"] = "ms"
         elif key == "request_throughput":
             self._stats_dict[key]["unit"] = "requests/sec"
+        elif key == "request_goodput":
+            self._stats_dict[key]["unit"] = "requests/sec"
         elif key.startswith("output_token_throughput"):
             self._stats_dict[key]["unit"] = "tokens/sec"
         elif "sequence_length" in key:
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
index 776535d15..beec0bea6 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -245,6 +245,24 @@ def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace:
         args.concurrency = 1
     return args
 
+def _check_goodput_args(args: argparse.Namespace) -> argparse.Namespace:
+    """
+    Parse and validate the goodput constraint args.
+    """
+    if args.goodput:
+        args.goodput = parse_goodput(args.goodput)
+        if "ttft" not in args.goodput and "itl" not in args.goodput:
+            raise argparse.ArgumentTypeError(
+                f"Invalid goodput constraints format: {args.goodput}. "
+                "Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
+            )
+        if "ttft" not in args.goodput:
+            args.goodput["ttft"] = 1e9  # unspecified constraint: effectively unbounded
+        if "itl" not in args.goodput:
+            args.goodput["itl"] = 1e9
+        if args.goodput["ttft"] < 0 or args.goodput["itl"] < 0:
+            raise ValueError("Goodput constraint values must be non-negative.")
+    return args
 
 def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
@@ -286,6 +304,18 @@ def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace:
     args.profile_export_file = args.artifact_dir / args.profile_export_file
     return args
 
+def parse_goodput(values):
+    constraints = {}
+    try:
+        for item in values:
+            target_metric, target_val = item.split(":")
+            constraints[target_metric] = float(target_val)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            f"Invalid goodput constraints format: {values}. "
+            "Expected format is 'ttft:x itl:y', where x and y are numbers in milliseconds."
+        )
+    return constraints
 
 def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace:
     if args.input_dataset:
@@ -651,6 +681,17 @@ def _add_other_args(parser):
         help="An option to enable verbose mode.",
     )
 
+def _add_goodput_args(parser):
+    goodput_group = parser.add_argument_group("Goodput")
+
+    goodput_group.add_argument(
+        "--goodput",
+        "-g",
+        nargs="+",
+        required=False,
+        help="Goodput constraints in the format 'ttft:x itl:y', "
+        "where x and y are durations in milliseconds.",
+    )
 
 def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict:
     request_inputs = {}
@@ -733,6 +774,7 @@ def _parse_profile_args(subparsers) -> argparse.ArgumentParser:
     _add_profile_args(profile)
     _add_output_args(profile)
     _add_other_args(profile)
+    _add_goodput_args(profile)
     profile.set_defaults(func=profile_handler)
     return profile
@@ -812,6 +854,7 @@ def refine_args(
         args = _check_image_input_args(parser, args)
         args = _check_load_manager_args(args)
         args = _set_artifact_paths(args)
+        args = _check_goodput_args(args)
     elif args.subcommand == Subcommand.COMPARE.to_lowercase():
         args = _check_compare_args(parser, args)
     else:
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
index 183f21fd2..8d59e0c77 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
@@ -69,8 +69,10 @@ def __init__(
         self,
         filename: Path,
         tokenizer: Tokenizer,
+        goodput_constraints: Dict[str, float] = {},
     ) -> None:
         self._tokenizer = tokenizer
+        self._goodput_constraints = goodput_constraints
         super().__init__(filename)
 
     def _parse_requests(self, requests: dict) -> Metrics:
@@ -145,10 +147,18 @@ def _parse_requests(self, requests: dict) -> Metrics:
             chunked_inter_token_latencies.append(chunked_inter_token_latency)
 
         # request & output token throughput
-        benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # nanosec
+        benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # ns to sec
         request_throughputs = [len(requests) / benchmark_duration]
         output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration]
 
+        # request goodput: requests per second that meet every goodput constraint
+        request_goodputs = []
+        if self._goodput_constraints:
+            request_good_count = self._count_good_req(
+                time_to_first_tokens, inter_token_latencies
+            )
+            request_goodputs = [request_good_count / benchmark_duration]
+
         return LLMMetrics(
             request_throughputs,
             request_latencies,
@@ -159,8 +169,21 @@ def _parse_requests(self, requests: dict) -> Metrics:
             output_sequence_lengths,
             input_sequence_lengths,
             chunked_inter_token_latencies,
+            request_goodputs,
         )
-
+
+    def _count_good_req(self, time_to_first_tokens, inter_token_latencies):
+        ttft_constraint_ms = self._goodput_constraints["ttft"]
+        itl_constraint_ms = self._goodput_constraints["itl"]
+        # Constraints are given in ms; the measured values are in ns.
+        ttft_constraint = ttft_constraint_ms * 1e6
+        itl_constraint = itl_constraint_ms * 1e6
+        good_req_count = 0
+        for ttft, itl in zip(time_to_first_tokens, inter_token_latencies):
+            if ttft <= ttft_constraint and itl <= itl_constraint:
+                good_req_count += 1
+        return good_req_count
+
     def _pairwise(self, iterable):
         """Generate pairs of consecutive elements from the given iterable."""
         a, b = tee(iterable)
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
index 76ef3e321..0ad4f0d3b 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -98,6 +98,7 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[str]:
         "image_height_mean",
         "image_height_stddev",
         "image_format",
+        "goodput",
     ]
 
     utils.remove_file(args.profile_export_file)
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
index f82e59312..5cb8c138c 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_json_exporter.py
@@ -263,6 +263,7 @@ def test_generate_json(self, monkeypatch) -> None:
   "artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1",
   "tokenizer": "hf-internal-testing/llama-tokenizer",
   "verbose": false,
+  "goodput": null,
   "subcommand": "profile",
   "prompt_source": "synthetic",
   "extra_inputs": {
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
index 689e366cd..8a90f37a8 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_metrics.py
@@ -71,11 +71,14 @@ def test_llm_metric_system_metrics(self) -> None:
         )
         sys_metrics = m.system_metrics
-        assert len(sys_metrics) == 2
+        assert len(sys_metrics) == 3
         assert sys_metrics[0].name == "output_token_throughput"
         assert sys_metrics[0].unit == "per sec"
-        assert sys_metrics[1].name == "request_throughput"
+        assert sys_metrics[1].name == "request_goodput"
         assert sys_metrics[1].unit == "per sec"
+        assert sys_metrics[2].name == "request_throughput"
+        assert sys_metrics[2].unit == "per sec"
+
     def test_llm_metrics_get_base_name(self) -> None:
         """Test get_base_name method in LLMMetrics class."""