Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not log full worker info in retire_workers #8935

Merged
merged 3 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions distributed/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7443,7 +7443,7 @@ async def retire_workers(
close_workers: bool = False,
remove: bool = True,
stimulus_id: str | None = None,
) -> dict[str, Any]: ...
) -> list[str]: ...

@overload
async def retire_workers(
Expand All @@ -7453,7 +7453,7 @@ async def retire_workers(
close_workers: bool = False,
remove: bool = True,
stimulus_id: str | None = None,
) -> dict[str, Any]: ...
) -> list[str]: ...

@overload
async def retire_workers(
Expand All @@ -7469,7 +7469,7 @@ async def retire_workers(
minimum: int | None = None,
target: int | None = None,
attribute: str = "address",
) -> dict[str, Any]: ...
) -> list[str]: ...

@log_errors
async def retire_workers(
Expand All @@ -7481,7 +7481,7 @@ async def retire_workers(
remove: bool = True,
stimulus_id: str | None = None,
**kwargs: Any,
) -> dict[str, Any]:
) -> list[str]:
"""Gracefully retire workers from cluster. Any key that is in memory exclusively
on the retired workers is replicated somewhere else.

Expand Down Expand Up @@ -7559,7 +7559,7 @@ async def retire_workers(
self.workers[address] for address in self.workers_to_close(**kwargs)
}
if not wss:
return {}
return []

stop_amm = False
amm: ActiveMemoryManagerExtension | None = self.extensions.get("amm")
Expand Down Expand Up @@ -7609,13 +7609,13 @@ async def retire_workers(
# time (depending on interval settings)
amm.run_once()

workers_info_ok = {}
workers_info_abort = {}
for addr, result, info in await asyncio.gather(*coros):
workers_info_ok = []
workers_info_abort = []
for addr, result in await asyncio.gather(*coros):
if result == "OK":
workers_info_ok[addr] = info
workers_info_ok.append(addr)
else:
workers_info_abort[addr] = info
workers_info_abort.append(addr)

finally:
if stop_amm:
Expand All @@ -7625,8 +7625,8 @@ async def retire_workers(
"all",
{
"action": "retire-workers",
"retired": workers_info_ok,
"could-not-retire": workers_info_abort,
"retired": list(workers_info_ok),
"could-not-retire": list(workers_info_abort),
"stimulus_id": stimulus_id,
},
)
Expand All @@ -7649,7 +7649,7 @@ async def _track_retire_worker(
close: bool,
remove: bool,
stimulus_id: str,
) -> tuple[str, Literal["OK", "no-recipients"], dict]:
) -> tuple[str, Literal["OK", "no-recipients"]]:
while not policy.done():
# Sleep 0.01s when there are 4 tasks or less
# Sleep 0.5s when there are 200 or more
Expand All @@ -7671,7 +7671,7 @@ async def _track_retire_worker(
f"Could not retire worker {ws.address!r}: unique data could not be "
f"moved to any other worker ({stimulus_id=!r})"
)
return ws.address, "no-recipients", ws.identity()
return ws.address, "no-recipients"

logger.debug(
f"All unique keys on worker {ws.address!r} have been replicated elsewhere"
Expand All @@ -7685,7 +7685,7 @@ async def _track_retire_worker(
self.close_worker(ws.address)

logger.info(f"Retired worker {ws.address!r} ({stimulus_id=!r})")
return ws.address, "OK", ws.identity()
return ws.address, "OK"

def add_keys(
self, worker: str, keys: Collection[Key] = (), stimulus_id: str | None = None
Expand Down
8 changes: 5 additions & 3 deletions distributed/tests/test_active_memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,7 +977,7 @@ async def test_RetireWorker_all_recipients_are_paused(c, s, a, b):

x = await c.scatter("x", workers=[a.address])
out = await c.retire_workers([a.address])
assert out == {}
assert not out
assert not s.extensions["amm"].policies
assert set(s.workers) == {a.address, b.address}

Expand Down Expand Up @@ -1230,7 +1230,7 @@ async def test_RetireWorker_with_actor(c, s, a, b, has_proxy):

with captured_logger("distributed.active_memory_manager", logging.WARNING) as log:
out = await c.retire_workers([a.address])
assert out == {}
assert not out
assert "it holds actor(s)" in log.getvalue()
assert "x" in a.state.actors

Expand All @@ -1250,7 +1250,7 @@ async def test_RetireWorker_with_actor_proxy(c, s, a, b):
assert "y" in b.data

out = await c.retire_workers([b.address])
assert out.keys() == {b.address}
assert out == (b.address,)
assert "x" in a.state.actors
assert "y" in a.data

Expand Down Expand Up @@ -1301,6 +1301,7 @@ async def tensordot_stress(c, s):
assert sum(t.start == "memory" for t in s.transition_log) == expected_tasks


@pytest.mark.slow
@gen_cluster(
client=True,
nthreads=[("", 1)] * 4,
Expand Down Expand Up @@ -1356,6 +1357,7 @@ async def test_ReduceReplicas_stress(c, s, *workers):
await tensordot_stress(c, s)


@pytest.mark.slow
@pytest.mark.parametrize("use_ReduceReplicas", [False, True])
@gen_cluster(
client=True,
Expand Down
1 change: 0 additions & 1 deletion distributed/tests/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1499,7 +1499,6 @@ async def test_retire_workers(c, s, a, b):

workers = await s.retire_workers()
assert list(workers) == [a.address]
assert workers[a.address]["nthreads"] == a.state.nthreads
assert list(s.workers) == [b.address]

assert s.workers_to_close() == []
Expand Down
2 changes: 1 addition & 1 deletion distributed/tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3553,7 +3553,7 @@ async def test_execute_preamble_abort_retirement(c, s):

# b has shut down. There's nowhere to replicate x to anymore, so retire_workers
# will give up and reinstate a to running status.
assert await retire_fut == {}
assert not await retire_fut
while a.status != Status.running:
await asyncio.sleep(0.01)

Expand Down
Loading