Skip to content

Commit e940fd8

Browse files
provisioner: explicit monarch hostmesh shutdown (#644)
1 parent 967c9f0 commit e940fd8

File tree

1 file changed

+23
-1
lines changed

1 file changed

+23
-1
lines changed

src/forge/controller/provisioner.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,14 @@
2222
from monarch._src.actor.actor_mesh import ActorMesh
2323
from monarch._src.actor.shape import Extent
2424

25-
from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host
25+
from monarch.actor import (
26+
Actor,
27+
endpoint,
28+
HostMesh,
29+
ProcMesh,
30+
shutdown_context,
31+
this_host,
32+
)
2633

2734
from monarch.tools import commands
2835
from monarch.utils import setup_env_for_distributed
@@ -486,6 +493,21 @@ async def shutdown_all_allocations(self):
486493
self._registered_actors.clear()
487494
self._registered_services.clear()
488495

496+
# -- HostMeshes (including the implicit local host) ---
497+
logger.info(f"Shutting down {len(self._host_mesh_map)} HostMesh(es)...")
498+
results = await asyncio.gather(
499+
*[host_mesh.shutdown() for host_mesh in self._host_mesh_map.values()],
500+
return_exceptions=True,
501+
)
502+
for (name, _), result in zip(self._host_mesh_map.items(), results, strict=True):
503+
if isinstance(result, Exception):
504+
logger.warning(f"Failed to shutdown HostMesh {name}: {result}")
505+
self._host_mesh_map.clear()
506+
try:
507+
await shutdown_context()
508+
except Exception as e:
509+
logger.warning(f"Failed to shutdown context: {e}")
510+
489511
async def shutdown(self):
490512
"""Tears down all remaining remote allocations."""
491513
await self.shutdown_all_allocations()

0 commit comments

Comments
 (0)