Skip to content

Commit

Permalink
Fix condor_suspend failing on resources that are not running
Browse files Browse the repository at this point in the history
  • Loading branch information
giffels committed Feb 4, 2025
1 parent 3b08b88 commit 4813f6d
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 7 deletions.
4 changes: 2 additions & 2 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.. Created by changelog.py at 2024-12-18, command
'/Users/giffler/.cache/pre-commit/repoecmh3ah8/py_env-python3.12/bin/changelog docs/source/changes compile --categories Added Changed Fixed Security Deprecated --output=docs/source/changelog.rst'
.. Created by changelog.py at 2025-02-04, command
'/Users/giffler/.cache/pre-commit/repoecmh3ah8/py_env-python3.13/bin/changelog docs/source/changes compile --categories Added Changed Fixed Security Deprecated --output=docs/source/changelog.rst'
based on the format of 'https://keepachangelog.com/'
#########
Expand Down
4 changes: 3 additions & 1 deletion tardis/adapters/sites/htcondor.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,14 @@ async def _condor_tool(
except CommandExecutionFailure as cef:
# the tool fails if none of the jobs are found – because they all just shut down
# report graceful failure for all
if cef.exit_code == 1 and "not found" in cef.stderr:
handle_error_msgs = ("not found", "not running to be")
if cef.exit_code == 1 and any(msg in cef.stderr for msg in handle_error_msgs):
return [False] * len(resource_attributes)
raise
# successes are in stdout, failures in stderr, both in argument order
# stdout: Job 15540.0 marked for removal
# stderr: Job 15612.0 not found
# stderr: Job 15611.0 not running to be suspended
# stderr: Job 15535.0 marked for removal
success_jobs = {
TOOL_ID_PATTERN.search(line).group(1)
Expand Down
28 changes: 24 additions & 4 deletions tests/adapters_t/sites_t/test_htcondorsiteadapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@
CONDOR_RM_FAILED_MESSAGE = "Run command condor_rm 1351043.0 via ShellExecutor failed"

CONDOR_SUSPEND_OUTPUT = """Job 1351043.0 suspended"""
CONDOR_SUSPEND_FAILED_OUTPUT = """Job 1351043.0 not found"""
CONDOR_SUSPEND_FAILED_OUTPUT_NOT_FOUND = """Job 1351043.0 not found"""
CONDOR_SUSPEND_FAILED_OUTPUT_NOT_RUNNING = (
"""Job 1351043.0 not running to be suspended"""
)
CONDOR_SUSPEND_FAILED_MESSAGE = """Run command condor_suspend 1351043 via
ShellExecutor failed"""

Expand Down Expand Up @@ -379,12 +382,29 @@ def test_stop_resource(self):
raise_exception=CommandExecutionFailure(
message=CONDOR_SUSPEND_FAILED_MESSAGE,
exit_code=1,
stderr=CONDOR_SUSPEND_FAILED_OUTPUT,
stderr=CONDOR_SUSPEND_FAILED_OUTPUT_NOT_FOUND,
stdout="",
stdin="",
),
)
def test_stop_resource_failed_redo(self):
def test_stop_resource_failed_redo_not_found(self):
with self.assertRaises(TardisResourceStatusUpdateFailed):
run_async(
self.adapter.stop_resource,
AttributeDict(remote_resource_uuid="1351043.0"),
)

@mock_executor_run_command(
stdout="",
raise_exception=CommandExecutionFailure(
message=CONDOR_SUSPEND_FAILED_MESSAGE,
exit_code=1,
stderr=CONDOR_SUSPEND_FAILED_OUTPUT_NOT_RUNNING,
stdout="",
stdin="",
),
)
def test_stop_resource_failed_redo_not_running(self):
with self.assertRaises(TardisResourceStatusUpdateFailed):
run_async(
self.adapter.stop_resource,
Expand All @@ -396,7 +416,7 @@ def test_stop_resource_failed_redo(self):
raise_exception=CommandExecutionFailure(
message=CONDOR_SUSPEND_FAILED_MESSAGE,
exit_code=2,
stderr=CONDOR_SUSPEND_FAILED_OUTPUT,
stderr=CONDOR_SUSPEND_FAILED_OUTPUT_NOT_FOUND,
stdout="",
stdin="",
),
Expand Down

0 comments on commit 4813f6d

Please sign in to comment.