Skip to content

Commit 9f3d6d6

Browse files
committed
Remove unused NCCL subgroup support
1 parent 6d1abcf commit 9f3d6d6

File tree

4 files changed

+1
-83
lines changed

4 files changed

+1
-83
lines changed

src/collective/comm.cu

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
8484
rc = std::move(rc) << [&] {
8585
return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_);
8686
} << [&] {
87-
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
88-
// Split subcommunicators inherit behavior from the parent communicator. With a non-blocking
89-
// parent, `ncclCommSplit`/subgroup setup was unstable for the quantile tree path and failed
90-
// with NCCL invalid-argument errors in MGPU tests. Keep the root communicator blocking until
91-
// NCCL subgroup initialization is reliable without it.
92-
config.blocking = 1;
93-
return this->stub_->CommInitRankConfig(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank(),
94-
&config);
87+
return this->stub_->CommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank());
9588
} << [&] {
9689
return BusyWait(this->stub_, this->nccl_comm_, this->Timeout());
9790
};
@@ -103,57 +96,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
10396
}
10497
}
10598

106-
std::size_t NCCLComm::GetSubgroup(std::vector<std::int32_t> const& active_ranks) const {
107-
auto it = std::find_if(this->subgroups_.cbegin(), this->subgroups_.cend(),
108-
[&](auto const& g) { return g.active_ranks == active_ranks; });
109-
if (it != this->subgroups_.cend()) {
110-
return static_cast<std::size_t>(std::distance(this->subgroups_.cbegin(), it));
111-
}
112-
113-
Subgroup g;
114-
g.active_ranks = active_ranks;
115-
g.stream = std::make_shared<curt::Stream>();
116-
117-
auto active =
118-
std::find(active_ranks.cbegin(), active_ranks.cend(), this->Rank()) != active_ranks.cend();
119-
auto color = active ? 1 : NCCL_SPLIT_NOCOLOR;
120-
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
121-
config.blocking = 0;
122-
config.splitShare = 0;
123-
124-
auto rc = this->stub_->CommSplit(this->nccl_comm_, color, this->Rank(), &g.comm, &config);
125-
SafeColl(rc);
126-
if (g.comm != nullptr) {
127-
rc = BusyWait(this->stub_, g.comm, this->Timeout());
128-
SafeColl(rc);
129-
}
130-
131-
this->subgroups_.push_back(std::move(g));
132-
return this->subgroups_.size() - 1;
133-
}
134-
13599
NCCLComm::~NCCLComm() {
136-
for (auto& subgroup : this->subgroups_) {
137-
if (subgroup.comm == nullptr) {
138-
continue;
139-
}
140-
auto rc = Success() << [this, &subgroup] {
141-
return this->stub_->CommFinalize(subgroup.comm);
142-
} << [this, &subgroup] {
143-
auto rc = BusyWait(this->stub_, subgroup.comm, this->Timeout());
144-
if (!rc.OK()) {
145-
return std::move(rc) + this->stub_->CommAbort(subgroup.comm);
146-
}
147-
return rc;
148-
} << [this, &subgroup] {
149-
return this->stub_->CommDestroy(subgroup.comm);
150-
};
151-
if (!rc.OK()) {
152-
LOG(WARNING) << rc.Report();
153-
}
154-
}
155-
this->subgroups_.clear();
156-
157100
if (nccl_comm_) {
158101
auto rc = Success() << [this] {
159102
return this->stub_->CommFinalize(this->nccl_comm_);

src/collective/comm.cuh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include <cstdint> // for int32_t
1111
#include <memory> // for shared_ptr
1212
#include <utility> // for move
13-
#include <vector> // for vector
1413

1514
#include "../common/cuda_stream.h" // for StreamRef
1615
#include "coll.h"
@@ -30,20 +29,12 @@ inline Result GetCUDAResult(cudaError rc) {
3029

3130
#if defined(XGBOOST_USE_NCCL)
3231
class NCCLComm : public Comm {
33-
public:
34-
struct Subgroup {
35-
std::vector<std::int32_t> active_ranks;
36-
ncclComm_t comm{nullptr};
37-
std::shared_ptr<curt::Stream> stream;
38-
};
39-
4032
private:
4133
ncclComm_t nccl_comm_{nullptr};
4234
std::shared_ptr<NcclStub> stub_;
4335
ncclUniqueId nccl_unique_id_{};
4436
curt::StreamRef stream_;
4537
std::string nccl_path_;
46-
mutable std::vector<Subgroup> subgroups_;
4738

4839
public:
4940
[[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
@@ -66,8 +57,6 @@ class NCCLComm : public Comm {
6657
this->ResetState();
6758
return Success();
6859
}
69-
[[nodiscard]] std::size_t GetSubgroup(std::vector<std::int32_t> const& active_ranks) const;
70-
[[nodiscard]] Subgroup const& SubgroupAt(std::size_t idx) const { return subgroups_.at(idx); }
7160
};
7261

7362
class NCCLChannel : public Channel {

src/collective/nccl_stub.cc

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,10 @@ no long bundles NCCL in the binary wheel.
9494
broadcast_ = safe_load(broadcast_, "ncclBroadcast");
9595
allgather_ = safe_load(allgather_, "ncclAllGather");
9696
comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
97-
comm_init_rank_config_ = safe_load(comm_init_rank_config_, "ncclCommInitRankConfig");
9897
comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
9998
comm_finalize_ = safe_load(comm_finalize_, "ncclCommFinalize");
10099
comm_get_async_error_ = safe_load(comm_get_async_error_, "ncclCommGetAsyncError");
101100
comm_abort_ = safe_load(comm_abort_, "ncclCommAbort");
102-
comm_split_ = safe_load(comm_split_, "ncclCommSplit");
103101
get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
104102
send_ = safe_load(send_, "ncclSend");
105103
recv_ = safe_load(recv_, "ncclRecv");
@@ -112,12 +110,10 @@ no long bundles NCCL in the binary wheel.
112110
broadcast_ = ncclBroadcast;
113111
allgather_ = ncclAllGather;
114112
comm_init_rank_ = ncclCommInitRank;
115-
comm_init_rank_config_ = ncclCommInitRankConfig;
116113
comm_destroy_ = ncclCommDestroy;
117114
comm_finalize_ = ncclCommFinalize;
118115
comm_get_async_error_ = ncclCommGetAsyncError;
119116
comm_abort_ = ncclCommAbort;
120-
comm_split_ = ncclCommSplit;
121117
get_uniqueid_ = ncclGetUniqueId;
122118
send_ = ncclSend;
123119
recv_ = ncclRecv;

src/collective/nccl_stub.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,10 @@ class NcclStub {
2828
decltype(ncclBroadcast)* broadcast_{nullptr};
2929
decltype(ncclAllGather)* allgather_{nullptr};
3030
decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
31-
decltype(ncclCommInitRankConfig)* comm_init_rank_config_{nullptr};
3231
decltype(ncclCommDestroy)* comm_destroy_{nullptr};
3332
decltype(ncclCommFinalize)* comm_finalize_{nullptr};
3433
decltype(ncclCommGetAsyncError)* comm_get_async_error_{nullptr};
3534
decltype(ncclCommAbort)* comm_abort_{nullptr};
36-
decltype(ncclCommSplit)* comm_split_{nullptr};
3735
decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
3836
decltype(ncclSend)* send_{nullptr};
3937
decltype(ncclRecv)* recv_{nullptr};
@@ -68,10 +66,6 @@ class NcclStub {
6866
int rank) const {
6967
return this->GetNcclResult(this->comm_init_rank_(comm, nranks, commId, rank));
7068
}
71-
[[nodiscard]] Result CommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId,
72-
int rank, ncclConfig_t* config) const {
73-
return this->GetNcclResult(this->comm_init_rank_config_(comm, nranks, commId, rank, config));
74-
}
7569
[[nodiscard]] Result CommDestroy(ncclComm_t comm) const {
7670
if (this->Aborted()) {
7771
return Success();
@@ -100,10 +94,6 @@ class NcclStub {
10094
this->aborted_ = true;
10195
return this->GetNcclResult(comm_abort_(comm));
10296
}
103-
[[nodiscard]] Result CommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
104-
ncclConfig_t* config) const {
105-
return this->GetNcclResult(comm_split_(comm, color, key, newcomm, config));
106-
}
10797
[[nodiscard]] Result GetUniqueId(ncclUniqueId* uniqueId) const {
10898
return this->GetNcclResult(get_uniqueid_(uniqueId));
10999
}

0 commit comments

Comments
 (0)