Commit dba967b

rgao committed
Remove communication-computation overlap code
Benchmarked overlap approach (splitting forward_chunk into local + boundary edge passes while A2A runs async) on H200 turbo mode:
- 8 GPU: ~17% slower than non-overlap A2A
- 16 GPU: ~12% slower than non-overlap A2A

Root cause: splitting forward_chunk into two calls loses torch.compile kernel fusion efficiency, costing more than the ~2ms of communication latency hidden per layer. The overhead from two separate SO2Conv passes (each with its own kernel launch, sync, scatter/gather) dominates.

Removed:
- _forward_overlap() method from Edgewise
- start_all_to_all_collect() / finish_all_to_all_collect() from graph_parallel
- Edge reorder pre-sorting from escn_md backbone
- Overlap-related test functions
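For reference, below is a minimal, self-contained sketch of the comm-compute overlap pattern this commit removes. It is not the fairchem implementation: `overlapped_layer`, `compute_local_edges`, and `compute_boundary_edges` are placeholder names standing in for the local/boundary halves of `forward_chunk`, and an initialized NCCL process group is assumed; only `dist.all_to_all_single(..., async_op=True)` and `Work.wait()` are real PyTorch APIs.

```python
import torch
import torch.distributed as dist


def overlapped_layer(x_local, send_idx, send_splits, recv_splits,
                     compute_local_edges, compute_boundary_edges):
    """Sketch: start the boundary all-to-all, compute local edges while the
    communication is in flight, then finish with the boundary edges."""
    x_send = x_local[send_idx].contiguous()
    x_recv = torch.empty(sum(recv_splits), *x_local.shape[1:],
                         device=x_local.device, dtype=x_local.dtype)

    # 1. Launch the all-to-all asynchronously; returns a Work handle.
    work = dist.all_to_all_single(
        x_recv, x_send,
        output_split_sizes=recv_splits,
        input_split_sizes=send_splits,
        async_op=True,
    )

    # 2. Do useful work (local edges) while communication is in flight.
    local_out = compute_local_edges(x_local)

    # 3. Wait, then process boundary edges against the received embeddings.
    work.wait()
    boundary_out = compute_boundary_edges(torch.cat([x_local, x_recv], dim=0))

    # 4. Local + boundary contributions sum to the full layer output.
    return local_out + boundary_out
```

Per the benchmarks above, issuing the compute as two separate calls cost more (lost torch.compile fusion) than the ~2ms of latency the overlap hides, hence the removal below.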
1 parent f53e1e7 commit dba967b

4 files changed

Lines changed: 0 additions & 439 deletions


src/fairchem/core/models/uma/escn_md.py

Lines changed: 0 additions & 20 deletions
@@ -7,7 +7,6 @@
 
 from __future__ import annotations
 
-import dataclasses
 import logging
 import os
 import types
@@ -1013,25 +1012,6 @@ def forward(self, data_dict: AtomicData) -> dict[str, torch.Tensor]:
         with record_function("layer_radial_emb"):
             x_edge_per_layer = self.backend.get_layer_radial_emb(x_edge, self)
 
-        # When overlap is enabled, pre-sort all per-edge tensors so local
-        # edges come first and boundary edges last. This lets the overlap
-        # path use compile-friendly split() instead of boolean indexing.
-        if (
-            gp_ctx is not None
-            and gp_ctx.edge_reorder is not None
-            and self.use_overlap_gp
-        ):
-            reorder = gp_ctx.edge_reorder
-            wigner = wigner[reorder]
-            wigner_inv_envelope = wigner_inv_envelope[reorder]
-            x_edge_per_layer = [xl[reorder] for xl in x_edge_per_layer]
-            gp_ctx = dataclasses.replace(
-                gp_ctx,
-                edge_index_local=gp_ctx.edge_index_local[:, reorder],
-                local_edge_mask=gp_ctx.local_edge_mask[reorder],
-                edge_reorder=None,  # consumed; don't reorder again
-            )
-
         for i in range(self.num_layers):
             with record_function(f"message passing {i}"):
                 x_message = self.blocks[i](
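The block deleted above applies an edge permutation (edge_reorder) prepared in build_gp_context so that local edges precede boundary edges. As a hypothetical illustration only (not the fairchem code), a stable argsort on the local-edge mask is one way to build such a permutation so later slicing can use compile-friendly contiguous splits instead of boolean indexing:

```python
import torch

# Assumed example mask: True = local edge, False = boundary edge.
local_edge_mask = torch.tensor([True, False, True, True, False])

# Stable sort keys: local edges (0) come before boundary edges (1).
reorder = torch.argsort((~local_edge_mask).long(), stable=True)
n_local = int(local_edge_mask.sum())

edge_feats = torch.randn(5, 8)[reorder]   # reorder any per-edge tensor once
local_feats = edge_feats[:n_local]        # contiguous local slice
boundary_feats = edge_feats[n_local:]     # contiguous boundary slice
```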

src/fairchem/core/models/uma/escn_md_block.py

Lines changed: 0 additions & 112 deletions
@@ -20,8 +20,6 @@
     GPContext,
     all_to_all_collect,
     all_to_all_collect_compiled,
-    finish_all_to_all_collect,
-    start_all_to_all_collect,
 )
 from fairchem.core.models.uma.nn.activation import (
     GateActivation,
@@ -138,38 +136,7 @@ def forward(
 
         When gp_ctx is provided, uses all-to-all to collect only the
         needed remote embeddings. Otherwise falls back to all-gather.
-
-        When use_overlap_gp is True and in eval mode, overlaps
-        communication with local edge computation for better latency.
         """
-        # Check if we should use the overlapped path:
-        # - gp_ctx must be provided (all-to-all mode)
-        # - use_overlap_gp must be enabled
-        # - must NOT be in training mode (overlap path doesn't support autograd)
-        # - must NOT need gradients (autograd forces/stress require
-        #   autograd-compatible communication, overlap path doesn't provide this)
-        # - must NOT use activation checkpointing (incompatible with edge split)
-        # - must have both local and boundary edges
-        needs_grad_for_overlap = torch.is_grad_enabled() and (
-            x.requires_grad if isinstance(x, torch.Tensor) else False
-        )
-        use_overlap = (
-            self.use_overlap_gp
-            and gp_ctx is not None
-            and gp_utils.initialized()
-            and not self.training
-            and not needs_grad_for_overlap
-            and self.activation_checkpoint_chunk_size is None
-            and gp_ctx.local_edge_mask is not None
-            and gp_ctx.num_local_edges > 0
-            and gp_ctx.num_boundary_edges > 0
-        )
-
-        if use_overlap:
-            return self._forward_overlap(
-                x, x_edge, wigner, wigner_inv_envelope, gp_ctx, send_indices
-            )
-
         if gp_ctx is not None and gp_utils.initialized():
             # All-to-all path: collect only needed remote embeddings.
             # When x requires grad (autograd forces/stress), we use the
@@ -252,85 +219,6 @@ def forward(
             new_embeddings = [torch.stack(new_embeddings).sum(axis=0)]
         return torch.stack(new_embeddings).sum(axis=0)
 
-    def _forward_overlap(
-        self,
-        x,
-        x_edge,
-        wigner,
-        wigner_inv_envelope,
-        gp_ctx: GPContext,
-        send_indices: torch.Tensor | None,
-    ):
-        """
-        Overlapped communication-computation forward pass.
-
-        Overlaps the all-to-all communication with local edge
-        computation for better inference latency. Only used in
-        eval mode (no autograd through the communication).
-
-        Edges are pre-sorted in build_gp_context (local edges first,
-        boundary edges last via edge_reorder). This allows using
-        compile-friendly split() instead of boolean indexing.
-
-        Steps:
-        1. Start async all-to-all to exchange boundary embeddings.
-        2. Compute local edges (both endpoints are local atoms)
-           while communication is in flight.
-        3. Wait for communication to complete.
-        4. Compute boundary edges (source is remote).
-        5. Sum local + boundary contributions.
-        """
-        edge_index_local = gp_ctx.edge_index_local
-        num_local_atoms = x.shape[0]
-        n_local = gp_ctx.num_local_edges
-
-        # Split pre-sorted per-edge data: local first, boundary last.
-        # No boolean indexing — compile-friendly.
-        local_edge_idx = edge_index_local[:, :n_local]
-        boundary_edge_idx = edge_index_local[:, n_local:]
-        local_x_edge = x_edge[:n_local]
-        boundary_x_edge = x_edge[n_local:]
-        local_wigner = wigner[:n_local]
-        boundary_wigner = wigner[n_local:]
-        local_wigner_inv = wigner_inv_envelope[:n_local]
-        boundary_wigner_inv = wigner_inv_envelope[n_local:]
-
-        # Step 1: Start async all-to-all
-        with record_function("a2a_collect_async_start"):
-            recv_buf, work_handles = start_all_to_all_collect(x, gp_ctx, send_indices)
-
-        # Step 2: Compute local edges while comm is in flight
-        with record_function("local_edges"):
-            local_contribution = self.forward_chunk(
-                x,
-                num_local_atoms,
-                local_x_edge,
-                local_edge_idx,
-                local_wigner,
-                local_wigner_inv,
-                0,
-            )
-
-        # Step 3: Wait for communication
-        with record_function("a2a_collect_async_wait"):
-            x_received = finish_all_to_all_collect(recv_buf, work_handles)
-            x_full = torch.cat([x, x_received], dim=0)
-
-        # Step 4: Compute boundary edges
-        with record_function("boundary_edges"):
-            boundary_contribution = self.forward_chunk(
-                x_full,
-                num_local_atoms,
-                boundary_x_edge,
-                boundary_edge_idx,
-                boundary_wigner,
-                boundary_wigner_inv,
-                0,
-            )
-
-        # Step 5: Sum contributions
-        return local_contribution + boundary_contribution
-
     def forward_chunk(
         self,
         x_full,

src/fairchem/core/models/uma/graph_parallel.py

Lines changed: 0 additions & 117 deletions
@@ -1014,120 +1014,3 @@ def all_to_all_collect_p2p(
     _safe_all_to_all(recv_chunks, send_chunks, group=gp_group)
 
     return x_recv
-
-
-@torch.compiler.disable
-def start_all_to_all_collect(
-    x_local: torch.Tensor,
-    gp_ctx: GPContext,
-    send_indices: torch.Tensor,
-) -> tuple[torch.Tensor, list[dist.Work]]:
-    """
-    Start async all-to-all communication for comm-compute overlap.
-
-    Launches the all-to-all without waiting for completion. Returns
-    the pre-allocated receive buffer and work handles. The caller
-    should do useful compute, then call ``finish_all_to_all_collect``
-    to wait for completion and get the received embeddings.
-
-    This function does NOT participate in autograd. For differentiable
-    all-to-all, use ``all_to_all_collect`` instead. This async variant
-    is intended for the overlap path where gradients are handled
-    separately.
-
-    Uses ``all_to_all_single`` on NCCL for efficiency (avoids Python
-    list creation from ``split()``).
-
-    Args:
-        x_local: Local atom embeddings, shape (local_atoms, *features).
-        gp_ctx: Graph parallel context.
-        send_indices: Local indices of atoms to send.
-
-    Returns:
-        Tuple of (recv_buffer, work_handles):
-            recv_buffer: Pre-allocated tensor for received embeddings.
-            work_handles: List of dist.Work handles to wait on.
-    """
-    feature_shape = x_local.shape[1:]
-
-    # Gather atoms to send
-    if send_indices.numel() > 0:
-        x_send = x_local[send_indices].contiguous()
-    else:
-        x_send = torch.empty(
-            0, *feature_shape, device=x_local.device, dtype=x_local.dtype
-        )
-
-    # Use precomputed splits if available (avoids .tolist() per layer)
-    send_splits = (
-        gp_ctx.send_splits
-        if gp_ctx.send_splits is not None
-        else gp_ctx.send_counts.tolist()
-    )
-    recv_splits = (
-        gp_ctx.recv_splits
-        if gp_ctx.recv_splits is not None
-        else gp_ctx.recv_counts.tolist()
-    )
-    total_recv = (
-        gp_ctx.total_recv if gp_ctx.total_recv is not None else sum(recv_splits)
-    )
-    x_recv = torch.empty(
-        total_recv, *feature_shape, device=x_local.device, dtype=x_local.dtype
-    )
-
-    # Launch async all-to-all
-    gp_group = gp_utils.get_gp_group()
-    backend = dist.get_backend(gp_group)
-
-    work_handles = []
-    if backend == "nccl":
-        # Use all_to_all_single for NCCL — packed tensor, no list creation
-        work = dist.all_to_all_single(
-            x_recv,
-            x_send,
-            output_split_sizes=recv_splits,
-            input_split_sizes=send_splits,
-            group=gp_group,
-            async_op=True,
-        )
-        work_handles.append(work)
-    else:
-        # Gloo fallback: use pairwise send/recv
-        send_list = list(x_send.split(send_splits))
-        recv_list = list(x_recv.split(recv_splits))
-        rank = dist.get_rank(gp_group)
-        world_size = dist.get_world_size(gp_group)
-        ops = []
-        for r in range(world_size):
-            if r == rank:
-                if send_list[r].numel() > 0:
-                    recv_list[r].copy_(send_list[r])
-            elif send_list[r].numel() > 0 or recv_list[r].numel() > 0:
-                # Skip zero-length P2P ops to avoid potential hangs
-                ops.append(dist.P2POp(dist.isend, send_list[r], r, group=gp_group))
-                ops.append(dist.P2POp(dist.irecv, recv_list[r], r, group=gp_group))
-        if ops:
-            work_handles = dist.batch_isend_irecv(ops)
-
-    return x_recv, work_handles
-
-
-@torch.compiler.disable
-def finish_all_to_all_collect(
-    recv_buffer: torch.Tensor,
-    work_handles: list[dist.Work],
-) -> torch.Tensor:
-    """
-    Wait for async all-to-all to complete and return received embeddings.
-
-    Args:
-        recv_buffer: Pre-allocated receive buffer from start_all_to_all_collect.
-        work_handles: Work handles from start_all_to_all_collect.
-
-    Returns:
-        x_received: Received remote atom embeddings.
-    """
-    for work in work_handles:
-        work.wait()
-    return recv_buffer
