[Misc] Remove deprecated code (#12383)
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 authored Jan 24, 2025
1 parent ab5bbf5 commit df5dafa
Showing 6 changed files with 25 additions and 78 deletions.
23 changes: 14 additions & 9 deletions tests/async_engine/test_api_server.py
@@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m", "--host",
-        "127.0.0.1", "--tokenizer-pool-size",
-        str(tokenizer_pool_size)
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
+        "--distributed-executor-backend",
+        distributed_executor_backend,
     ]
 
-    if worker_use_ray:
-        commands.append("--worker-use-ray")
     uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, tokenizer_pool_size: int,
-                    worker_use_ray: bool):
+                    distributed_executor_backend: str):
     """
     Run the API server and test it.
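
The updated fixture receives the backend purely by argument name: pytest makes a value parametrized on the test available to any fixture that requests it. A minimal, runnable sketch of that pattern (the fixture and test names here are invented for illustration and are not part of this commit):

import pytest


@pytest.fixture
def backend_args(distributed_executor_backend: str) -> list:
    # Receives the value the test parametrizes under the same name,
    # mirroring how api_server consumes it above.
    return ["--distributed-executor-backend", distributed_executor_backend]


@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_backend_args(backend_args, distributed_executor_backend: str):
    assert backend_args[-1] == distributed_executor_backend
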
18 changes: 9 additions & 9 deletions tests/basic_correctness/test_preemption.py
@@ -29,10 +29,10 @@ def check_settings():
 
 
 @pytest.fixture
-def worker_use_ray() -> bool:
-    # When SPMD worker is used, use ray_use_worker=True
+def distributed_executor_backend() -> str:
+    # When SPMD worker is used, use distributed_executor_backend="ray"
     # to test delta input optimization works with preemption.
-    return envs.VLLM_USE_RAY_SPMD_WORKER
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -47,7 +47,7 @@ def test_chunked_prefill_recompute(
     dtype: str,
     max_tokens: int,
     chunked_prefill_token_size: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Ensure that chunked prefill works with preemption."""
     max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -66,7 +66,7 @@ def test_chunked_prefill_recompute(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -93,7 +93,7 @@ def test_preemption(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """By default, recompute preemption is enabled"""
 
@@ -104,7 +104,7 @@ def test_preemption(
             model,
             dtype=dtype,
             disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -144,7 +144,7 @@ def test_preemption_infeasible(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Verify infeasible preemption request will be ignored."""
     BLOCK_SIZE = 16
@@ -159,7 +159,7 @@ def test_preemption_infeasible(
             # ignored instead of hanging forever.
             num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
             max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         sampling_params = SamplingParams(max_tokens=max_tokens,
                                          ignore_eos=True)
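
Callers outside the test suite that forwarded the removed keyword to the engine make the same one-for-one swap to the string-valued backend. A rough sketch against the public LLM API (the model, prompt, and env-var gate mirror the fixture above; this is illustrative, needs a GPU, and the "ray" value needs Ray installed):

import os

from vllm import LLM, SamplingParams

# Mirrors the updated fixture: VLLM_USE_RAY_SPMD_WORKER selects the Ray
# backend, otherwise fall back to multiprocessing.
backend = "ray" if os.environ.get("VLLM_USE_RAY_SPMD_WORKER") == "1" else "mp"

# Before this commit: LLM(model=..., worker_use_ray=True)
llm = LLM(model="facebook/opt-125m", distributed_executor_backend=backend)
outputs = llm.generate(["San Francisco is a"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
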
3 changes: 2 additions & 1 deletion tests/multi_step/test_correctness_async_llm.py
@@ -16,7 +16,8 @@
 NUM_PROMPTS = [10]
 
 DEFAULT_SERVER_ARGS: List[str] = [
-    "--worker-use-ray",
+    "--distributed-executor-backend",
+    "ray",
     "--gpu-memory-utilization",
     "0.85",
     "--swap-space",
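
The same swap applies when launching the OpenAI-compatible server: the --worker-use-ray switch is gone, so the backend is named explicitly. A sketch of starting the server with the updated flags (the entrypoint module and model name are assumptions for illustration; the remaining flags come from the list above):

import subprocess
import sys

server_cmd = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", "facebook/opt-125m",
    # Previously: "--worker-use-ray"
    "--distributed-executor-backend", "ray",
    "--gpu-memory-utilization", "0.85",
]
server = subprocess.Popen(server_cmd)
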
10 changes: 0 additions & 10 deletions vllm/config.py
@@ -1227,9 +1227,6 @@ class ParallelConfig:
     pipeline_parallel_size: int = 1  # Number of pipeline parallel groups.
     tensor_parallel_size: int = 1  # Number of tensor parallel groups.
 
-    # Deprecated, use distributed_executor_backend instead.
-    worker_use_ray: Optional[bool] = None
-
     # Maximum number of multiple batches
     # when load model sequentially. To avoid RAM OOM when using tensor
     # parallel and large models.
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None:
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size
 
-        if self.worker_use_ray:
-            if self.distributed_executor_backend is None:
-                self.distributed_executor_backend = "ray"
-            elif not self.use_ray:
-                raise ValueError(f"worker-use-ray can't be used with "
-                                 f"distributed executor backend "
-                                 f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
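
With the __post_init__ translation gone, code that builds ParallelConfig directly has to name the backend itself. A minimal sketch under that assumption (field names as they appear in this diff; the "ray" value requires a Ray installation):

from vllm.config import ParallelConfig

parallel_config = ParallelConfig(
    pipeline_parallel_size=1,
    tensor_parallel_size=2,
    # Previously implied by worker_use_ray=True; now it must be explicit.
    distributed_executor_backend="ray",
)
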
6 changes: 0 additions & 6 deletions vllm/engine/arg_utils.py
@@ -100,7 +100,6 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
@@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
             'only supports Ray for distributed inference.')
 
-        parser.add_argument(
-            '--worker-use-ray',
-            action='store_true',
-            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -1071,7 +1066,6 @@ def create_engine_config(self,
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
-            worker_use_ray=self.worker_use_ray,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
             tokenizer_pool_config=TokenizerPoolConfig.create_config(
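
Scripts that drove the engine through EngineArgs, or that passed --worker-use-ray on the command line, migrate the same way. A sketch with the old spelling kept as a comment (the model name is illustrative):

from vllm.engine.arg_utils import EngineArgs

# Before this commit:
#   EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
engine_args = EngineArgs(
    model="facebook/opt-125m",
    distributed_executor_backend="ray",
)
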
43 changes: 0 additions & 43 deletions vllm/engine/metrics.py
@@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             documentation="Number of emitted tokens.",
             labelnames=labelnames))
-
-        # Deprecated in favor of vllm:prompt_tokens_total
-        self.gauge_avg_prompt_throughput = self._gauge_cls(
-            name="vllm:avg_prompt_throughput_toks_per_s",
-            documentation="Average prefill throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-        # Deprecated in favor of vllm:generation_tokens_total
-        self.gauge_avg_generation_throughput = self._gauge_cls(
-            name="vllm:avg_generation_throughput_toks_per_s",
-            documentation="Average generation throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
 
 
 # end-metrics-definitions
 
@@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_histogram(self.metrics.histogram_max_tokens_request,
                             stats.max_tokens_requests)
 
-    def _log_prometheus_interval(self, prompt_throughput: float,
-                                 generation_throughput: float) -> None:
-        # Logs metrics to prometheus that are computed every logging_interval.
-        # Support legacy gauge metrics that make throughput calculations on
-        # the vLLM side. Moving forward, we should use counters like
-        # counter_prompt_tokens, counter_generation_tokens
-        # Which log raw data and calculate summaries using rate() on the
-        # grafana/prometheus side. See
-        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
-        self.metrics.gauge_avg_prompt_throughput.labels(
-            **self.labels).set(prompt_throughput)
-        self.metrics.gauge_avg_generation_throughput.labels(
-            **self.labels).set(generation_throughput)
-
     def log(self, stats: Stats):
         """Logs to prometheus and tracked stats every iteration."""
         # Log to prometheus.
@@ -664,20 +635,6 @@ def log(self, stats: Stats):
         # Log locally every local_interval seconds.
         if local_interval_elapsed(stats.now, self.last_local_log,
                                   self.local_interval):
-            # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
-            prompt_throughput = get_throughput(self.num_prompt_tokens,
-                                               now=stats.now,
-                                               last_log=self.last_local_log)
-            generation_throughput = get_throughput(
-                self.num_generation_tokens,
-                now=stats.now,
-                last_log=self.last_local_log)
-
-            self._log_prometheus_interval(
-                prompt_throughput=prompt_throughput,
-                generation_throughput=generation_throughput)
-
             if self.spec_decode_metrics is not None:
                 self._log_gauge(
                     self.metrics.gauge_spec_decode_draft_acceptance_rate,
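
The deleted gauges computed throughput on the vLLM side once per logging interval. As the removed comment explains, the replacement is to export raw token counters (vllm:prompt_tokens_total, vllm:generation_tokens_total) and derive throughput in Prometheus or Grafana, e.g. rate(vllm:prompt_tokens_total[1m]). A hedged sketch of that counter-only pattern using prometheus_client directly; the label and helper names are illustrative, not vLLM's exact wiring:

from prometheus_client import Counter

# Monotonic counters; throughput is computed server-side, e.g. with
#   rate(vllm:prompt_tokens_total[1m])
prompt_tokens = Counter("vllm:prompt_tokens_total",
                        "Number of prefill tokens processed.",
                        labelnames=["model_name"])
generation_tokens = Counter("vllm:generation_tokens_total",
                            "Number of generation tokens processed.",
                            labelnames=["model_name"])


def record_iteration(model_name: str, num_prompt_tokens: int,
                     num_generation_tokens: int) -> None:
    # Only raw counts are exported; no per-interval division in-process.
    prompt_tokens.labels(model_name=model_name).inc(num_prompt_tokens)
    generation_tokens.labels(model_name=model_name).inc(num_generation_tokens)
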
