
Commit 28d9bd3

Author: rgao
Remove unused overlap/P2P fields and parameters

Clean up dead code left over from the overlap removal:

- Remove the local_edge_mask, num_local_edges, num_boundary_edges, and
  edge_reorder fields from the GPContext dataclass
- Remove the edge classification computation from build_gp_context()
- Remove the use_overlap_gp and use_p2p_gp parameters from the
  eSCNMDBlock and Edgewise constructors
- Remove the TestEdgeClassification test class (4 tests for removed fields)
- Simplify the batched GPU→CPU transfer (2 fewer scalars)
1 parent: dba967b

4 files changed: 7 additions & 129 deletions

src/fairchem/core/models/uma/escn_md.py (0 additions & 6 deletions)

@@ -325,8 +325,6 @@ def __init__(
         execution_mode: str = "general",
         use_all_to_all_gp: bool = False,
         gp_partition_strategy: str = "index_split",
-        use_overlap_gp: bool = False,
-        use_p2p_gp: bool = False,
     ) -> None:
         super().__init__()
         self.max_num_elements = max_num_elements
@@ -379,8 +377,6 @@ def __init__(
         )
         self.edge_chunk_size = edge_chunk_size
         self.use_all_to_all_gp = use_all_to_all_gp
-        self.use_overlap_gp = use_overlap_gp
-        self.use_p2p_gp = use_p2p_gp
         self.gp_partition_strategy = PartitionStrategy(gp_partition_strategy)

         self.backend = get_execution_backend(execution_mode)
@@ -514,8 +510,6 @@ def __init__(
                 self.ff_type,
                 activation_checkpoint_chunk_size=activation_checkpoint_chunk_size,
                 backend=self.backend,
-                use_overlap_gp=self.use_overlap_gp,
-                use_p2p_gp=self.use_p2p_gp,
             )
             self.blocks.append(block)
src/fairchem/core/models/uma/escn_md_block.py (0 additions & 8 deletions)

@@ -59,8 +59,6 @@ def __init__(
         activation_checkpoint_chunk_size: int | None,
         backend: ExecutionBackend,
         act_type: Literal["gate", "s2"] = "gate",
-        use_overlap_gp: bool = False,
-        use_p2p_gp: bool = False,
     ):
         super().__init__()

@@ -69,8 +67,6 @@ def __init__(
         self.lmax = lmax
         self.mmax = mmax
         self.activation_checkpoint_chunk_size = activation_checkpoint_chunk_size
-        self.use_overlap_gp = use_overlap_gp
-        self.use_p2p_gp = use_p2p_gp
         self.backend = backend

         self.mappingReduced = mappingReduced
@@ -347,8 +343,6 @@ def __init__(
         ff_type: Literal["spectral", "grid"],
         activation_checkpoint_chunk_size: int | None,
         backend: ExecutionBackend,
-        use_overlap_gp: bool = False,
-        use_p2p_gp: bool = False,
     ) -> None:
         super().__init__()
         self.sphere_channels = sphere_channels
@@ -372,8 +366,6 @@ def __init__(
             act_type=act_type,
             activation_checkpoint_chunk_size=activation_checkpoint_chunk_size,
             backend=backend,
-            use_overlap_gp=use_overlap_gp,
-            use_p2p_gp=use_p2p_gp,
         )

         self.norm_2 = get_normalization_layer(
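One caller-facing consequence of dropping these keywords: configs that still pass use_overlap_gp or use_p2p_gp will now fail fast at construction instead of being silently ignored. A self-contained sketch with a toy class standing in for the real constructors (whose full signatures are not shown in this diff):

class Toy:
    def __init__(self, lmax: int, mmax: int) -> None:  # removed kwargs are gone
        self.lmax, self.mmax = lmax, mmax

stale_cfg = {"lmax": 4, "mmax": 2, "use_overlap_gp": True}
try:
    Toy(**stale_cfg)
except TypeError as err:
    print(err)  # ... got an unexpected keyword argument 'use_overlap_gp'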

src/fairchem/core/models/uma/graph_parallel.py (7 additions & 44 deletions)

@@ -100,17 +100,6 @@ class GPContext:
         edge_index_local: Precomputed edge index remapped to local indices.
             None if not yet computed (set by build_gp_context when edge_index
             is provided).
-        local_edge_mask: Boolean mask identifying fully-local edges (both src
-            and tgt are local atoms). Used for comm-compute overlap. Shape:
-            (num_edges,). None if not yet computed.
-        num_local_edges: Number of fully-local edges (precomputed from
-            local_edge_mask). None if not yet computed.
-        num_boundary_edges: Number of boundary edges (src is remote). None
-            if not yet computed.
-        edge_reorder: Permutation that sorts edges so local edges come first,
-            then boundary edges. Shape: (num_edges,). Applied once to per-edge
-            tensors (wigner, x_edge, etc.) in the backbone forward. Enables
-            compile-friendly overlap via split() instead of boolean indexing.
     """

     rank: int
@@ -126,10 +115,6 @@ class GPContext:
     total_needed_atoms: int
     send_indices: torch.Tensor | None = None
     edge_index_local: torch.Tensor | None = None
-    local_edge_mask: torch.Tensor | None = None
-    num_local_edges: int | None = None
-    num_boundary_edges: int | None = None
-    edge_reorder: torch.Tensor | None = None
     # Precomputed Python lists to avoid repeated .tolist() in AllToAllCollect
     send_splits: list[int] | None = None
     recv_splits: list[int] | None = None
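Why send_splits/recv_splits are cached as Python lists: torch.distributed.all_to_all_single takes its split sizes as plain ints, so deriving them from GPU tensors inside every layer's forward would cost a .tolist() sync each time. A hedged sketch of the pattern (assumes an initialized process group; the exchange helper is illustrative, not the AllToAllCollect implementation):

import torch
import torch.distributed as dist

def exchange(x: torch.Tensor, send_splits: list[int],
             recv_splits: list[int], group=None) -> torch.Tensor:
    # Split sizes are already Python ints: no GPU->CPU sync happens here.
    out = x.new_empty((sum(recv_splits), *x.shape[1:]))
    dist.all_to_all_single(
        out, x,
        output_split_sizes=recv_splits,
        input_split_sizes=send_splits,
        group=group,
    )
    return out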
@@ -273,7 +258,7 @@ def build_gp_context(

     Args:
         edge_index: Edge index filtered to edges whose targets are in
-            this rank's partition, shape (2, num_local_edges).
+            this rank's partition, shape (2, num_edges).
             Row 0 = source, row 1 = target.
         rank_assignments: Rank assignment for each atom, shape (total_atoms,).
         rank: This rank's GP rank.
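A small worked example of this contract (mirroring the data used by the removed tests below): with rank_assignments [0, 0, 0, 1, 1, 1], rank 0 owns atoms 0-2, so every target (row 1) must be local while sources (row 0) may be remote:

import torch

rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
edge_index = torch.tensor([[0, 1, 3, 4],   # sources: atoms 3, 4 are remote
                           [1, 2, 0, 1]])  # targets: all owned by rank 0
assert (rank_assignments[edge_index[1]] == 0).all()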
@@ -408,23 +393,9 @@ def build_gp_context(
     # Precompute edge_index_local
     edge_index_local = global_to_local[edge_index]

-    # Classify edges: fully-local (both endpoints local) vs boundary
-    # (source is remote). Used for communication-computation overlap.
-    src_is_local = edge_index_local[0] < total_local_atoms
-    tgt_is_local_edge = edge_index_local[1] < total_local_atoms
-    local_edge_mask = src_is_local & tgt_is_local_edge
-
-    # Pre-compute edge reorder permutation: local edges first, boundary
-    # edges last. This enables compile-friendly overlap via split()
-    # instead of boolean indexing. The reorder is applied in the
-    # backbone forward to all per-edge tensors simultaneously.
-    edge_reorder = torch.argsort((~local_edge_mask).to(torch.int32), stable=True)
-
     # Batch ALL GPU→CPU scalar extractions into a single transfer.
-    # This batches send_counts, recv_counts, local_edge_count, AND
-    # validation scalars into ONE .cpu() call, eliminating 2 extra
-    # GPU→CPU syncs from separate .all()/.any() validation checks.
-    local_edge_count = local_edge_mask.sum().unsqueeze(0).to(torch.long)
+    # This batches send_counts, recv_counts, AND validation scalars
+    # into ONE .cpu() call, eliminating extra GPU→CPU syncs.
     bad_edge_count = (edge_index_local < 0).sum().unsqueeze(0).to(torch.long)
     send_valid = (
         torch.ones(1, dtype=torch.long, device=device)
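For context, the removed edge_reorder trick in isolation: a stable argsort on the inverted mask keys local edges (0) before boundary edges (1) while preserving relative order, so one gather plus split() replaces per-tensor boolean indexing. A standalone sketch, not fairchem code:

import torch

local_edge_mask = torch.tensor([True, False, True, False, True])
edge_reorder = torch.argsort((~local_edge_mask).to(torch.int32), stable=True)
# edge_reorder == tensor([0, 2, 4, 1, 3]): local edges first, boundary last

wigner = torch.randn(5, 8)              # stand-in for a per-edge tensor
num_local = int(local_edge_mask.sum())  # 3
local_part, boundary_part = wigner[edge_reorder].split(
    [num_local, wigner.shape[0] - num_local]
)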
@@ -436,16 +407,12 @@ def build_gp_context(
             .to(torch.long)
         )
     )
-    all_cpu = torch.cat(
-        [send_counts, recv_counts, local_edge_count, bad_edge_count, send_valid]
-    ).cpu()
+    all_cpu = torch.cat([send_counts, recv_counts, bad_edge_count, send_valid]).cpu()
     send_splits = all_cpu[:world_size].tolist()
     recv_splits = all_cpu[world_size : 2 * world_size].tolist()
     total_recv = sum(recv_splits)
-    num_local_edges = int(all_cpu[2 * world_size].item())
-    num_boundary_edges = edge_index_local.shape[1] - num_local_edges
-    n_bad = int(all_cpu[2 * world_size + 1].item())
-    send_ok = int(all_cpu[2 * world_size + 2].item())
+    n_bad = int(all_cpu[2 * world_size].item())
+    send_ok = int(all_cpu[2 * world_size + 1].item())

     # Validate AFTER the batched CPU transfer (no extra GPU syncs).
     if not send_ok:
529496
total_needed_atoms=total_needed_atoms,
530497
send_indices=send_indices,
531498
edge_index_local=edge_index_local,
532-
local_edge_mask=local_edge_mask,
533-
num_local_edges=num_local_edges,
534-
num_boundary_edges=num_boundary_edges,
535-
edge_reorder=edge_reorder,
536499
# Precompute Python lists once (avoids .tolist() per layer per forward)
537500
send_splits=send_splits,
538501
recv_splits=recv_splits,
539502
total_recv=total_recv,
540-
# Precompute sparse neighbor lists for P2P communication
503+
# Precompute sparse neighbor lists for communication
541504
send_neighbors=[
542505
r for r in range(world_size) if send_splits[r] > 0 and r != rank
543506
],

tests/core/models/uma/test_graph_parallel.py (0 additions & 71 deletions)

@@ -568,77 +568,6 @@ def test_a2a_spatial_partition():
     )


-class TestEdgeClassification:
-    """
-    Tests for local_edge_mask precomputation in GPContext.
-    """
-
-    def test_edge_mask_types(self):
-        """
-        Verify that local_edge_mask is computed and has correct type/shape.
-        """
-        # 6 atoms, 2 ranks, edges cross the partition boundary.
-        # build_gp_context expects edges pre-filtered to targets in this
-        # rank's partition (atoms 0, 1, 2 for rank 0).
-        rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
-        edge_index = torch.tensor(
-            [
-                [0, 1, 2, 3],
-                [1, 2, 0, 0],
-            ]
-        )
-        ctx = build_gp_context(edge_index, rank_assignments, rank=0, world_size=2)
-        assert ctx.local_edge_mask is not None
-        assert ctx.local_edge_mask.dtype == torch.bool
-        assert ctx.local_edge_mask.shape[0] == edge_index.shape[1]
-        assert ctx.num_local_edges is not None
-        assert ctx.num_boundary_edges is not None
-        assert ctx.num_local_edges + ctx.num_boundary_edges == edge_index.shape[1]
-
-    def test_all_local_edges(self):
-        """
-        When all edges are within the local partition, all should be local.
-        """
-        rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
-        # All edges within rank 0's partition
-        edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
-        ctx = build_gp_context(edge_index, rank_assignments, rank=0, world_size=2)
-        assert ctx.num_local_edges == 3
-        assert ctx.num_boundary_edges == 0
-        assert ctx.local_edge_mask.all()
-
-    def test_all_boundary_edges(self):
-        """
-        When all edges have remote sources, all should be boundary.
-        """
-        rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
-        # All edges from rank 1 atoms to rank 0 atoms
-        edge_index = torch.tensor([[3, 4, 5], [0, 1, 2]])
-        ctx = build_gp_context(edge_index, rank_assignments, rank=0, world_size=2)
-        assert ctx.num_local_edges == 0
-        assert ctx.num_boundary_edges == 3
-        assert not ctx.local_edge_mask.any()
-
-    def test_mixed_edges(self):
-        """
-        Verify correct classification of mixed local and boundary edges.
-        """
-        rank_assignments = torch.tensor([0, 0, 0, 1, 1, 1])
-        # 4 edges: 2 local (0->1, 1->2), 2 boundary (3->0, 4->1)
-        edge_index = torch.tensor(
-            [
-                [0, 1, 3, 4],
-                [1, 2, 0, 1],
-            ]
-        )
-        ctx = build_gp_context(edge_index, rank_assignments, rank=0, world_size=2)
-        assert ctx.num_local_edges == 2
-        assert ctx.num_boundary_edges == 2
-        # First 2 edges are local, last 2 are boundary
-        expected_mask = torch.tensor([True, True, False, False])
-        assert torch.equal(ctx.local_edge_mask, expected_mask)
-
-
 # =========================================================================
 # Distributed tests: send_info optimization correctness
 # =========================================================================
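Note that the removed classification stays one line away if it is ever needed again: it can be derived from the surviving edge_index_local field, exactly as the deleted build_gp_context code did. A hypothetical helper (num_local_atoms is assumed to be available to the caller, as total_local_atoms was inside build_gp_context):

import torch

def local_edge_mask(edge_index_local: torch.Tensor,
                    num_local_atoms: int) -> torch.Tensor:
    # Local atoms occupy indices [0, num_local_atoms); an edge is fully
    # local when both endpoints fall in that range.
    return (edge_index_local[0] < num_local_atoms) & (
        edge_index_local[1] < num_local_atoms
    )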
