@@ -20,6 +20,34 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
2020 ss << " Memory allocation error on worker " << rank << " : " << err << " \n "
2121 << " - Free memory: " << HumanMemUnit (dh::AvailableMemory (device)) << " \n "
2222 << " - Requested memory: " << HumanMemUnit (bytes) << std::endl;
23+
24+ cudaMemPool_t memPool;
25+ std::size_t reserved_bytes = 0 ;
26+ std::size_t used_bytes = 0 ;
27+
28+ // Get the default memory pool for the current device
29+ auto status = cudaDeviceGetDefaultMemPool (&memPool, ::xgboost::curt::CurrentDevice ());
30+ if (status != cudaSuccess) {
31+ ss << " Failed to get default memory pool: " << cudaGetErrorString (status) << " \n " ;
32+ LOG (FATAL) << ss.str ();
33+ }
34+
35+ // Get the current total reserved memory size
36+ status = cudaMemPoolGetAttribute (memPool, cudaMemPoolAttrReservedMemCurrent, &reserved_bytes);
37+ if (status != cudaSuccess) {
38+ ss << " Failed to get reserved memory attribute: " << cudaGetErrorString (status) << " \n " ;
39+ LOG (FATAL) << ss.str ();
40+ }
41+ ss << " - Reserved by pool:" << HumanMemUnit (reserved_bytes) << " \n " ;
42+
43+ // Get the current total used memory size
44+ status = cudaMemPoolGetAttribute (memPool, cudaMemPoolAttrUsedMemCurrent, &used_bytes);
45+ if (status != cudaSuccess) {
46+ ss << " Failed to get used memory attribute: " << cudaGetErrorString (status) << " \n " ;
47+ LOG (FATAL) << ss.str ();
48+ }
49+
50+ ss << " - Used by pool:" << HumanMemUnit (used_bytes) << " \n " ;
2351 LOG (FATAL) << ss.str ();
2452}
2553
0 commit comments