@@ -84,14 +84,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
8484 rc = std::move (rc) << [&] {
8585 return GetUniqueId (root, this ->stub_ , pimpl, &nccl_unique_id_);
8686 } << [&] {
87- ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
88- // Split subcommunicators inherit behavior from the parent communicator. With a non-blocking
89- // parent, `ncclCommSplit`/subgroup setup was unstable for the quantile tree path and failed
90- // with NCCL invalid-argument errors in MGPU tests. Keep the root communicator blocking until
91- // NCCL subgroup initialization is reliable without it.
92- config.blocking = 1 ;
93- return this ->stub_ ->CommInitRankConfig (&nccl_comm_, root.World (), nccl_unique_id_, root.Rank (),
94- &config);
87+ return this ->stub_ ->CommInitRank (&nccl_comm_, root.World (), nccl_unique_id_, root.Rank ());
9588 } << [&] {
9689 return BusyWait (this ->stub_ , this ->nccl_comm_ , this ->Timeout ());
9790 };
@@ -103,57 +96,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
10396 }
10497}
10598
106- std::size_t NCCLComm::GetSubgroup (std::vector<std::int32_t > const & active_ranks) const {
107- auto it = std::find_if (this ->subgroups_ .cbegin (), this ->subgroups_ .cend (),
108- [&](auto const & g) { return g.active_ranks == active_ranks; });
109- if (it != this ->subgroups_ .cend ()) {
110- return static_cast <std::size_t >(std::distance (this ->subgroups_ .cbegin (), it));
111- }
112-
113- Subgroup g;
114- g.active_ranks = active_ranks;
115- g.stream = std::make_shared<curt::Stream>();
116-
117- auto active =
118- std::find (active_ranks.cbegin (), active_ranks.cend (), this ->Rank ()) != active_ranks.cend ();
119- auto color = active ? 1 : NCCL_SPLIT_NOCOLOR;
120- ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
121- config.blocking = 0 ;
122- config.splitShare = 0 ;
123-
124- auto rc = this ->stub_ ->CommSplit (this ->nccl_comm_ , color, this ->Rank (), &g.comm , &config);
125- SafeColl (rc);
126- if (g.comm != nullptr ) {
127- rc = BusyWait (this ->stub_ , g.comm , this ->Timeout ());
128- SafeColl (rc);
129- }
130-
131- this ->subgroups_ .push_back (std::move (g));
132- return this ->subgroups_ .size () - 1 ;
133- }
134-
13599NCCLComm::~NCCLComm () {
136- for (auto & subgroup : this ->subgroups_ ) {
137- if (subgroup.comm == nullptr ) {
138- continue ;
139- }
140- auto rc = Success () << [this , &subgroup] {
141- return this ->stub_ ->CommFinalize (subgroup.comm );
142- } << [this , &subgroup] {
143- auto rc = BusyWait (this ->stub_ , subgroup.comm , this ->Timeout ());
144- if (!rc.OK ()) {
145- return std::move (rc) + this ->stub_ ->CommAbort (subgroup.comm );
146- }
147- return rc;
148- } << [this , &subgroup] {
149- return this ->stub_ ->CommDestroy (subgroup.comm );
150- };
151- if (!rc.OK ()) {
152- LOG (WARNING) << rc.Report ();
153- }
154- }
155- this ->subgroups_ .clear ();
156-
157100 if (nccl_comm_) {
158101 auto rc = Success () << [this ] {
159102 return this ->stub_ ->CommFinalize (this ->nccl_comm_ );
0 commit comments