From 701e97286cb2d95e42970d49974a00a7c5eef79b Mon Sep 17 00:00:00 2001 From: Yikai Gao Date: Tue, 17 Dec 2024 13:01:43 -0800 Subject: [PATCH] run par as an entrypoint if there is no patch or jetter patch. (#994) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/994 # Context: Currently, when running torchx local job, we are using penv_python as entrypoint. That means we pass the actual .par or .xar file as argument to penv_python. within penv_python, the par/xar is executed as a new process. # Old way to run torchx local job. For example, if the local job is running "jetter --help", torchx runs it like: PENV_PAR='/data/users/yikai/fbsource/buck-out/v2/gen/fbcode/a6cb9616985b22b0/jetter/__jetter-bin__/jetter-bin-inplace.par' penv_python -m jetter.main --help It passes the par file as an environment variable called "PENV_PAR"(There is another way to pass this to penv_python, which is passing 'PENV_PARNAME' as env variable then get the par file's path using it. But it is very very rare, only 0.1% of total usage.) # New way to run torchx local job After migration, We will run it like: PAR_MAIN_OVERRIDE=jetter.main /data/users/yikai/fbsource/buck-out/v2/gen/fbcode/a6cb9616985b22b0/jetter/__jetter-bin__/jetter-bin-inplace.par --help NOTE: This diff only migrates one of the most common use cases, which: 1. There are no patch or jetter patch. 2. it's a par not xar. 3. the par file is passed via "PENV_PAR" env variable. For other use cases, we still run penv_python as entrypoint. Reviewed By: Sanjay-Ganeshan Differential Revision: D66621649 --- torchx/schedulers/local_scheduler.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index aa899b1d..3e11fee4 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -28,6 +28,7 @@ import warnings from dataclasses import asdict, dataclass from datetime import datetime +from subprocess import Popen from types import FrameType from typing import ( Any, @@ -696,12 +697,11 @@ def _popen( log.debug(f"Running {role_name} (replica {replica_id}):\n {args_pfmt}") env = self._get_replica_env(replica_params) - proc = subprocess.Popen( + proc = self.run_local_job( args=replica_params.args, env=env, stdout=stdout_, stderr=stderr_, - start_new_session=True, cwd=replica_params.cwd, ) return _LocalReplica( @@ -714,6 +714,23 @@ def _popen( error_file=env.get("TORCHELASTIC_ERROR_FILE", ""), ) + def run_local_job( + self, + args: List[str], + env: Dict[str, str], + stdout: Optional[io.FileIO], + stderr: Optional[io.FileIO], + cwd: Optional[str] = None, + ) -> Popen[bytes]: + return subprocess.Popen( + args=args, + env=env, + stdout=stdout, + stderr=stderr, + start_new_session=True, + cwd=cwd, + ) + def _get_replica_output_handles( self, replica_params: ReplicaParam,