Skip to content

Commit 4947c01

Browse files
committed
Get CUDA memory pool stats after running into OOM.
Log the default CUDA memory pool's reserved and used bytes, in addition to the free and requested memory already reported.
1 parent 2eaa90a commit 4947c01

File tree

1 file changed

+28
-0
lines changed

1 file changed

+28
-0
lines changed

src/common/device_vector.cu

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,34 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
2020
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
2121
<< "- Free memory: " << HumanMemUnit(dh::AvailableMemory(device)) << "\n"
2222
<< "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
23+
24+
cudaMemPool_t memPool;
25+
std::size_t reserved_bytes = 0;
26+
std::size_t used_bytes = 0;
27+
28+
// Get the default memory pool for the current device
29+
auto status = cudaDeviceGetDefaultMemPool(&memPool, ::xgboost::curt::CurrentDevice());
30+
if (status != cudaSuccess) {
31+
ss << "Failed to get default memory pool: " << cudaGetErrorString(status) << "\n";
32+
LOG(FATAL) << ss.str();
33+
}
34+
35+
// Get the current total reserved memory size
36+
status = cudaMemPoolGetAttribute(memPool, cudaMemPoolAttrReservedMemCurrent, &reserved_bytes);
37+
if (status != cudaSuccess) {
38+
ss << "Failed to get reserved memory attribute: " << cudaGetErrorString(status) << "\n";
39+
LOG(FATAL) << ss.str();
40+
}
41+
ss << "- Reserved by pool:" << HumanMemUnit(reserved_bytes) << "\n";
42+
43+
// Get the current total used memory size
44+
status = cudaMemPoolGetAttribute(memPool, cudaMemPoolAttrUsedMemCurrent, &used_bytes);
45+
if (status != cudaSuccess) {
46+
ss << "Failed to get used memory attribute: " << cudaGetErrorString(status) << "\n";
47+
LOG(FATAL) << ss.str();
48+
}
49+
50+
ss << "- Used by pool:" << HumanMemUnit(used_bytes) << "\n";
2351
LOG(FATAL) << ss.str();
2452
}
2553

0 commit comments

Comments
 (0)