
Commit 4b6289d

Eliminate NCCL index-exchange from A2A setup via local send_info computation
Compute send_info (which local atoms to send to which ranks) directly in filter_edges_by_node_partition using the full pre-filter edge_index, instead of requiring an NCCL all_to_all collective in build_gp_context. This removes the most expensive collective from the per-step setup path. Also fixes Morton Z-order balanced partition (i*P//N instead of ceil-based) and adds record_function tracing annotations. Validated: 27 graph_parallel + 10 escn_md tests pass. Benchmarked: 4.7-11% speedup at 64 GPUs (exp 18), parity at 8 GPUs (exp 17).
1 parent 6c95830 · commit 4b6289d
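The trick in one picture: every rank builds the same global edge_index before filtering, so rank s can already see which of its atoms feed edges into rank r's partition and never needs to be told by r. A minimal standalone sketch of that observation (toy tensors and variable names are mine, not the repo's):

    import torch

    # Toy graph: 6 atoms on 2 ranks. Rank 0 owns atoms 0-2, rank 1 owns 3-5.
    rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
    edge_index = torch.tensor([[0, 1, 4, 2],   # row 0: source atoms
                               [3, 2, 0, 5]])  # row 1: target atoms

    rank = 0  # pretend we are GP rank 0
    src_rank = rank_assignments[edge_index[0]]
    tgt_rank = rank_assignments[edge_index[1]]

    # Our atoms that source an edge into a remote partition must be sent there.
    must_send = (src_rank == rank) & (tgt_rank != rank)
    print(edge_index[0, must_send])                    # tensor([0, 2]) -> atoms to ship
    print(rank_assignments[edge_index[1, must_send]])  # tensor([1, 1]) -> destinations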

4 files changed

Lines changed: 294 additions & 41 deletions


src/fairchem/core/graph/compute.py

Lines changed: 99 additions & 14 deletions
@@ -22,27 +22,89 @@ def filter_edges_by_node_partition(
     cell_offsets: torch.Tensor,
     neighbors: torch.Tensor,
     num_atoms: int,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Filter edges to keep only those where the target atom belongs to the node partition.
-    edge_index is shape (2, num_edges) where the first row is the source atom index and the second row is the target atom index for each edge
-    cell_offsets is shape (num_edges, 3)
-    neighbors is cardinality of the edge_index per system in the batch
+    rank_assignments: torch.Tensor | None = None,
+    rank: int | None = None,
+    world_size: int | None = None,
+) -> (
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    | tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]
+):
+    """
+    Filter edges to keep only those where the target atom belongs to
+    the node partition.
+
+    When rank_assignments, rank, and world_size are provided, also
+    computes send_info: which local atoms need to be sent to which
+    ranks for all-to-all graph parallel communication. This exploits
+    access to the full (pre-filter) edge_index to derive send
+    metadata locally, eliminating the need for an NCCL index-exchange
+    collective in build_gp_context.
 
     Args:
-        node_partition: Tensor of atom indices belonging to the current rank's partition.
-        edge_index: Edge index tensor of shape (2, num_edges), where row 0 is the source and 1 is the target atom.
-        cell_offsets: Cell offsets tensor of shape (num_edges, 3).
-        neighbors: Tensor with edge count per system in the batch (length = num_systems).
-        num_atoms: Total number of atoms across all batches. Used to create a boolean mask for filtering.
+        node_partition: Atom indices in the current rank's partition.
+        edge_index: Full edge index, shape (2, num_edges).
+        cell_offsets: Cell offsets, shape (num_edges, 3).
+        neighbors: Edge count per system in the batch.
+        num_atoms: Total atoms across all batches.
+        rank_assignments: Rank for each atom, shape (num_atoms,).
+            If provided along with rank and world_size, send_info
+            is computed and returned as a 4th element.
+        rank: This rank's GP rank.
+        world_size: GP world size.
 
     Returns:
-        Filtered edge_index, cell_offsets, and neighbors tensors.
+        Filtered (edge_index, cell_offsets, neighbors).
+        If rank_assignments is provided, also returns send_info dict
+        with keys: send_counts, send_indices_global.
     """
     target_atoms = edge_index[1]
     node_mask = torch.zeros(num_atoms, dtype=torch.bool, device=target_atoms.device)
     node_mask[node_partition] = True
     local_edge_mask = node_mask[target_atoms]
 
+    # Compute send info BEFORE discarding non-local edges.
+    # An edge (src, tgt) where src is LOCAL and tgt is REMOTE means
+    # src must be sent to rank_assignments[tgt].
+    send_info = None
+    if rank_assignments is not None and rank is not None and world_size is not None:
+        src_is_local = node_mask[edge_index[0]]
+        tgt_is_remote = ~local_edge_mask
+        send_edge_mask = src_is_local & tgt_is_remote
+
+        if send_edge_mask.any():
+            send_src = edge_index[0, send_edge_mask]
+            send_dst_rank = rank_assignments[edge_index[1, send_edge_mask]]
+
+            # Unique (dst_rank, src_atom) pairs, sorted by rank then atom.
+            # Key layout: dst_rank * num_atoms + src_atom ensures rank-major
+            # ordering, matching what _fused_index_exchange produces.
+            key = send_dst_rank.to(torch.long) * num_atoms + send_src.to(torch.long)
+            unique_keys = key.unique(sorted=True)
+            send_ranks = unique_keys // num_atoms
+            send_atoms = unique_keys % num_atoms
+
+            send_counts = torch.zeros(
+                world_size, dtype=torch.long, device=edge_index.device
+            )
+            send_counts.scatter_add_(
+                0,
+                send_ranks,
+                torch.ones_like(send_ranks),
+            )
+            send_info = {
+                "send_counts": send_counts,
+                "send_indices_global": send_atoms,
+            }
+        else:
+            send_info = {
+                "send_counts": torch.zeros(
+                    world_size, dtype=torch.long, device=edge_index.device
+                ),
+                "send_indices_global": torch.empty(
+                    0, dtype=torch.long, device=edge_index.device
+                ),
+            }
+
     # Create system index for each edge to track which system each edge belongs to
     num_systems = neighbors.shape[0]
     edge_system_idx = torch.repeat_interleave(
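A worked instance of the key encoding above, with illustrative numbers: num_atoms = 10, world_size = 4, and three required (rank, atom) sends, one duplicated across edges. The flat key dst_rank * num_atoms + src_atom makes a plain sort produce rank-major order, unique() deduplicates pairs, and scatter_add_ recovers per-rank counts:

    import torch

    num_atoms, world_size = 10, 4
    send_dst_rank = torch.tensor([2, 0, 2, 2])  # per send-edge destination rank
    send_src = torch.tensor([7, 7, 2, 2])       # per send-edge source atom

    key = send_dst_rank * num_atoms + send_src  # tensor([27,  7, 22, 22])
    unique_keys = key.unique(sorted=True)       # tensor([ 7, 22, 27])
    send_ranks = unique_keys // num_atoms       # tensor([0, 2, 2])
    send_atoms = unique_keys % num_atoms        # tensor([7, 2, 7])

    send_counts = torch.zeros(world_size, dtype=torch.long)
    send_counts.scatter_add_(0, send_ranks, torch.ones_like(send_ranks))
    print(send_counts)  # tensor([1, 0, 2, 0]) -- one atom to rank 0, two to rank 2
    print(send_atoms)   # tensor([7, 2, 7])    -- grouped by destination rank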
@@ -55,6 +117,8 @@ def filter_edges_by_node_partition(
     if neighbors.shape[0] == 1:
         # If there's only one system, we can skip the scatter_add step and just return the count of remaining edges
         new_neighbors = local_edge_mask.sum(dtype=neighbors.dtype).unsqueeze(0)
+        if send_info is not None:
+            return edge_index, cell_offsets, new_neighbors, send_info
         return edge_index, cell_offsets, new_neighbors
 
     filtered_edge_system_idx = edge_system_idx[local_edge_mask]

@@ -69,6 +133,8 @@ def filter_edges_by_node_partition(
         torch.ones_like(filtered_edge_system_idx, dtype=neighbors.dtype),
     )
 
+    if send_info is not None:
+        return edge_index, cell_offsets, new_neighbors, send_info
     return edge_index, cell_offsets, new_neighbors
 
 

@@ -123,8 +189,12 @@ def generate_graph(
     radius_pbc_version: int,
     pbc: torch.Tensor,
     node_partition: torch.Tensor | None = None,
+    rank_assignments: torch.Tensor | None = None,
+    rank: int | None = None,
+    world_size: int | None = None,
 ) -> dict:
-    """Generate a graph representation from atomic structure data.
+    """
+    Generate a graph representation from atomic structure data.
 
     Args:
         data (dict): A dictionary containing a batch of molecular structures.

@@ -138,6 +208,9 @@ def generate_graph(
         radius_pbc_version: the version of radius_pbc impl (1, 2, or 3 for NVIDIA)
         pbc (list[bool]): The periodic boundary conditions in 3 dimensions, defaults to [True,True,True] for 3D pbc
         node_partition (torch.Tensor | None): The partitioning of the nodes (atoms) for distributed inference. If provided, returned graph will be filtered to keep only edges where the target atom (edge_index[1,:]) belongs to the current rank's partition.
+        rank_assignments: Rank for each atom (for A2A send_info).
+        rank: This rank's GP rank (for A2A send_info).
+        world_size: GP world size (for A2A send_info).
 
     Returns:
         dict: A dictionary containing the generated graph with the following keys:

@@ -147,6 +220,7 @@ def generate_graph(
             - 'cell_offsets' (torch.Tensor): Offsets of the cell vectors for each edge.
             - 'offset_distances' (torch.Tensor): Distances between the atoms connected by the edges, including the cell offsets.
             - 'neighbors' (torch.Tensor): Number of neighbors for each atom.
+            - 'send_info' (dict, optional): Send metadata for A2A GP when rank_assignments is provided.
     """
     if radius_pbc_version == 1:
         radius_graph_pbc_fn = radius_graph_pbc

@@ -168,14 +242,22 @@ def generate_graph(
     )
 
     # for v2 it is still faster right now to not do this post filtering, need to investigate further
+    send_info = None
     if node_partition is not None and radius_pbc_version != 2:
-        edge_index, cell_offsets, neighbors = filter_edges_by_node_partition(
+        filter_result = filter_edges_by_node_partition(
             node_partition,
             edge_index,
             cell_offsets,
             neighbors,
             num_atoms=data.pos.shape[0],
+            rank_assignments=rank_assignments,
+            rank=rank,
+            world_size=world_size,
         )
+        if rank_assignments is not None:
+            edge_index, cell_offsets, neighbors, send_info = filter_result
+        else:
+            edge_index, cell_offsets, neighbors = filter_result
 
     out = get_pbc_distances(
         data.pos,

@@ -192,11 +274,14 @@ def generate_graph(
     cell_offset_distances = out["offsets"]
     distance_vec = out["distance_vec"]
 
-    return {
+    result = {
         "edge_index": edge_index,
         "edge_distance": edge_dist,
         "edge_distance_vec": distance_vec,
         "cell_offsets": cell_offsets,
         "offset_distances": cell_offset_distances,
        "neighbors": neighbors,
     }
+    if send_info is not None:
+        result["send_info"] = send_info
+    return result
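The two return shapes in action, as a minimal sketch with toy tensors (the import path is inferred from the file header above; values are illustrative):

    import torch
    from fairchem.core.graph.compute import filter_edges_by_node_partition

    # 6 atoms, one system, 4 edges; rank 0 owns atoms 0-2.
    edge_index = torch.tensor([[0, 1, 4, 2],
                               [3, 2, 0, 5]])
    cell_offsets = torch.zeros(4, 3)
    neighbors = torch.tensor([4])
    node_partition = torch.tensor([0, 1, 2])
    rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])

    # Legacy 3-tuple: no GP arguments.
    edge_index_f, cell_offsets_f, neighbors_f = filter_edges_by_node_partition(
        node_partition, edge_index, cell_offsets, neighbors, num_atoms=6
    )

    # New 4-tuple: send_info is derived locally, no collective involved.
    *_, send_info = filter_edges_by_node_partition(
        node_partition, edge_index, cell_offsets, neighbors, num_atoms=6,
        rank_assignments=rank_assignments, rank=0, world_size=2,
    )
    print(send_info["send_counts"])          # tensor([0, 2]): two atoms go to rank 1
    print(send_info["send_indices_global"])  # tensor([0, 2])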

src/fairchem/core/models/uma/escn_md_block.py

Lines changed: 4 additions & 3 deletions
@@ -172,9 +172,10 @@ def forward(
             local_node_offset = 0
         elif gp_utils.initialized():
             # Legacy all-gather path
-            x_full = gp_utils.gather_from_model_parallel_region_sum_grad(
-                x, total_atoms_across_gp_ranks
-            )
+            with record_function("allgather_collect"):
+                x_full = gp_utils.gather_from_model_parallel_region_sum_grad(
+                    x, total_atoms_across_gp_ranks
+                )
             edge_index_local = edge_index
             local_node_offset = node_offset
         else:
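record_function only names a region for the PyTorch profiler; it changes no numerics. A self-contained illustration of what the annotation buys (the matmul stands in for the real all-gather; the label matches the one added above):

    import torch
    from torch.profiler import ProfilerActivity, profile, record_function

    x = torch.randn(128, 64)
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        with record_function("allgather_collect"):  # shows up as its own row
            y = x @ x.T

    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))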

src/fairchem/core/models/uma/graph_parallel.py

Lines changed: 39 additions & 21 deletions
@@ -12,6 +12,7 @@
 
 import torch
 from torch import distributed as dist
+from torch.profiler import record_function
 
 from fairchem.core.common import gp_utils
 
@@ -199,15 +200,15 @@ def partition_atoms_spatial(
     x, y, z = norm[:, 0], norm[:, 1], norm[:, 2]
     morton = _expand_bits_10(x) | (_expand_bits_10(y) << 1) | (_expand_bits_10(z) << 2)
 
-    # Sort by Morton code and assign to ranks in equal chunks
+    # Sort by Morton code and assign to ranks in balanced chunks.
+    # Use ``i * P // N`` mapping (not ``i // ceil(N/P)``) to ensure
+    # EVERY rank receives at least ``floor(N/P)`` atoms. The ceil-based
+    # formula can leave trailing ranks empty when N is not a multiple of P
+    # (e.g. 1000 atoms / 64 ranks → rank 63 gets 0 atoms, causing a
+    # hang in collective communication).
     _, sorted_indices = morton.sort()
-    chunk_size = (N + num_ranks - 1) // num_ranks
     assignments = torch.empty(N, dtype=torch.long, device=device)
-    assignments[sorted_indices] = torch.div(
-        torch.arange(N, device=device),
-        chunk_size,
-        rounding_mode="floor",
-    ).clamp(max=num_ranks - 1)
+    assignments[sorted_indices] = torch.arange(N, device=device) * num_ranks // N
 
     return assignments
 
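A standalone check of the comment's arithmetic (N = 1000 atoms, P = 64 ranks):

    import torch

    N, P = 1000, 64

    # Old ceil-based mapping: chunk = ceil(N / P) = 16, so ranks 0-61 take
    # 16 atoms each (992 total), rank 62 takes 8, and rank 63 gets nothing.
    chunk = (N + P - 1) // P
    old = (torch.arange(N) // chunk).clamp(max=P - 1)
    print(torch.bincount(old, minlength=P)[-3:])      # tensor([16,  8,  0])

    # New i * P // N mapping: every rank gets floor(N/P) or ceil(N/P) atoms.
    new = torch.arange(N) * P // N
    print(torch.bincount(new, minlength=P).unique())  # tensor([15, 16])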
@@ -276,11 +277,13 @@ def partition_atoms_index_split(
     return assignments
 
 
+@torch.compiler.disable
 def build_gp_context(
     edge_index: torch.Tensor,
     rank_assignments: torch.Tensor,
     rank: int,
     world_size: int,
+    send_info: dict | None = None,
 ) -> GPContext:
     """
     Build the GP context from edge connectivity and atom assignments.
@@ -289,17 +292,26 @@ def build_gp_context(
     other ranks), exchanges atom indices via a single fused all-to-all,
     and computes all communication metadata.
 
-    Uses a single padded all-to-all collective (instead of the previous
-    2-step approach of count exchange + index exchange) by padding atom
-    index lists to a fixed size per rank. This halves the number of
-    collective operations in the setup path.
+    When send_info is provided (pre-computed during graph filtering in
+    filter_edges_by_node_partition), the NCCL index-exchange collective
+    is skipped entirely: send_counts and send_indices_global are taken
+    directly from send_info. This eliminates the most expensive collective
+    in the setup path.
 
     Args:
-        edge_index: Full graph edge index, shape (2, num_edges).
+        edge_index: Edge index filtered to edges whose targets are in
+            this rank's partition, shape (2, num_local_edges).
             Row 0 = source, row 1 = target.
         rank_assignments: Rank assignment for each atom, shape (total_atoms,).
         rank: This rank's GP rank.
         world_size: GP world size.
+        send_info: Pre-computed send metadata from graph filtering.
+            If provided, must contain:
+            - send_counts: Tensor of shape (world_size,) with count of
+              atoms to send to each rank.
+            - send_indices_global: Tensor of global atom indices to send,
+              sorted by destination rank.
+            When provided, _fused_index_exchange is skipped.
 
     Returns:
         GPContext with all metadata needed for all-to-all communication.
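For concreteness, a well-formed send_info under this contract (values are illustrative): the rank below ships atoms 5 and 9 to rank 0 and atom 42 to rank 2, so the indices are concatenated in destination-rank order and the counts sum to their length:

    import torch

    send_info = {
        # how many atoms this rank sends to each of 4 ranks
        "send_counts": torch.tensor([2, 0, 1, 0], dtype=torch.long),
        # global atom indices, grouped by destination rank (rank-major)
        "send_indices_global": torch.tensor([5, 9, 42], dtype=torch.long),
    }
    assert send_info["send_counts"].sum() == send_info["send_indices_global"].numel()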
@@ -341,15 +353,21 @@ def build_gp_context(
 
     # Fused count + index exchange: single padded all-to-all replaces
     # the old 2-step approach (count exchange + index exchange).
-    send_counts, send_indices_global = _fused_index_exchange(
-        needed_atoms=needed_atoms,
-        needed_from_ranks=needed_from_ranks,
-        recv_counts=recv_counts,
-        rank=rank,
-        world_size=world_size,
-        total_atoms=total_atoms,
-        device=device,
-    )
+    if send_info is not None:
+        # Pre-computed during graph filtering: skip the NCCL collective.
+        send_counts = send_info["send_counts"]
+        send_indices_global = send_info["send_indices_global"]
+    else:
+        with record_function("a2a_fused_index_exchange"):
+            send_counts, send_indices_global = _fused_index_exchange(
+                needed_atoms=needed_atoms,
+                needed_from_ranks=needed_from_ranks,
+                recv_counts=recv_counts,
+                rank=rank,
+                world_size=world_size,
+                total_atoms=total_atoms,
+                device=device,
+            )
 
     # Build global_to_local mapping:
     # Local atoms: index 0..total_local_atoms-1 (in order of node_partition)
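Why skipping the exchange is safe can be sanity-checked without NCCL: simulate every rank on one process and assert that what rank s derives locally to send to rank r equals the remote sources rank r keeps after filtering (what the collective used to deliver). Both are computed from the same global edge list, which is exactly why the exchange is redundant. A standalone sketch, with toy tensors and helper names of my own:

    import torch

    torch.manual_seed(0)
    N, P, E = 20, 4, 60
    rank_assignments = torch.randint(0, P, (N,))
    edge_index = torch.randint(0, N, (2, E))

    def sent_by(s, r):
        # Rank s, BEFORE filtering: its atoms sourcing edges into r's partition.
        m = (rank_assignments[edge_index[0]] == s) & (rank_assignments[edge_index[1]] == r)
        return set(edge_index[0, m].tolist())

    def needed_by(r, s):
        # Rank r, AFTER filtering: it kept only edges targeting its own atoms
        # and needs those edges' remote sources owned by s.
        kept = edge_index[:, rank_assignments[edge_index[1]] == r]
        return set(kept[0, rank_assignments[kept[0]] == s].tolist())

    assert all(
        sent_by(s, r) == needed_by(r, s)
        for s in range(P) for r in range(P) if s != r
    )
    print("locally derived send sets match the exchanged index sets")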
