Skip to content

Commit 2da4a05

Browse files
committed
Changes to enable threadsafe operation
1 parent 2f497b9 commit 2da4a05

12 files changed

Lines changed: 337 additions & 44 deletions

File tree

dynet/aligned-mem-pool.cc

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,25 @@
44

55
using namespace dynet;
66

7+
8+
void DynamicCPUMemoryPool::zero(void* p, size_t n) {
9+
auto rounded_n = a->round_up_align(n);
10+
a->zero(p, rounded_n);
11+
}
12+
13+
void* DynamicCPUMemoryPool::allocate(size_t n) {
14+
auto rounded_n = a->round_up_align(n);
15+
void* res = a->malloc(rounded_n);
16+
if (res) {
17+
ptrs.push_back(res);
18+
sizes.push_back(rounded_n);
19+
}
20+
return res;
21+
}
22+
23+
void DynamicCPUMemoryPool::sys_alloc(size_t cap) {}
24+
25+
726
void* InternalMemoryPool::allocate(size_t n) {
827
auto rounded_n = a->round_up_align(n);
928
if (rounded_n + used > capacity) {
@@ -22,9 +41,13 @@ void InternalMemoryPool::sys_alloc(size_t cap) {
2241
used = 0;
2342
}
2443

25-
AlignedMemoryPool::AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit) : name(name), cap(initial_cap), current(0), a(a), expanding_unit(expanding_unit) {
44+
AlignedMemoryPool::AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit, bool dynamic) : name(name), cap(initial_cap), current(0), a(a), expanding_unit(expanding_unit), dynamic(dynamic) {
2645
DYNET_ARG_CHECK(cap > 0, "Attempt to allocate memory of size 0 in AlignedMemoryPool");
27-
pools.push_back(new InternalMemoryPool(name, cap, a));
46+
if (dynamic) {
47+
pools.push_back(new DynamicCPUMemoryPool(name, cap));
48+
} else {
49+
pools.push_back(new InternalMemoryPool(name, cap, a));
50+
}
2851
}
2952
AlignedMemoryPool::~AlignedMemoryPool() {
3053
for ( auto p : pools) { delete p; }
@@ -35,7 +58,11 @@ void* AlignedMemoryPool::allocate(size_t n) {
3558
if (res == 0) {
3659
// round up to the nearest multiple of expanding_unit
3760
size_t new_pool_size = (n + expanding_unit-1) / expanding_unit * expanding_unit;
38-
pools.push_back(new InternalMemoryPool(name, new_pool_size, a));
61+
if (dynamic) {
62+
pools.push_back(new DynamicCPUMemoryPool(name, new_pool_size));
63+
} else {
64+
pools.push_back(new InternalMemoryPool(name, new_pool_size, a));
65+
}
3966
cap += new_pool_size;
4067
current++;
4168
res = pools[current]->allocate(n);
@@ -47,7 +74,12 @@ void AlignedMemoryPool::free() {
4774
if (current > 0) {
4875
for (auto p : pools) { delete p; }
4976
pools.clear();
50-
pools.push_back(new InternalMemoryPool(name, cap, a));
77+
if (dynamic) {
78+
pools.push_back(new DynamicCPUMemoryPool(name, cap));
79+
} else {
80+
pools.push_back(new InternalMemoryPool(name, cap, a));
81+
}
82+
cap = cap * (current + 1);
5183
current = 0;
5284
}
5385
pools[0]->free();

dynet/aligned-mem-pool.h

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,64 @@
88

99
namespace dynet {
1010

11-
class InternalMemoryPool {
11+
class BaseMemoryPool {
1212
public:
13-
explicit InternalMemoryPool(const std::string & name, size_t cap, MemAllocator* a) : name(name), a(a) {
13+
BaseMemoryPool(const std::string & name, MemAllocator* a) : name(name), a(a) {}
14+
virtual ~BaseMemoryPool() {}
15+
virtual void* allocate(size_t n) = 0;
16+
17+
virtual void free() = 0;
18+
// zeros out the amount of allocations
19+
virtual void zero_allocated_memory() = 0;
20+
21+
size_t used;
22+
23+
protected:
24+
virtual void sys_alloc(size_t cap) {}
25+
virtual void zero_all() {}
26+
27+
MemAllocator* a;
28+
std::string name;
29+
void* mem;
30+
};
31+
32+
class DynamicCPUMemoryPool : public BaseMemoryPool {
33+
private:
34+
std::vector<void*> ptrs;
35+
std::vector<size_t> sizes;
36+
37+
public:
38+
explicit DynamicCPUMemoryPool(const std::string & name, size_t cap)
39+
: BaseMemoryPool(name, new CPUAllocator()) {}
40+
41+
~DynamicCPUMemoryPool() {
42+
free();
43+
delete a;
44+
}
45+
46+
void* allocate(size_t n);
47+
void zero(void* p, size_t n);
48+
49+
void free() {
50+
for (auto p : ptrs)
51+
a->free(p);
52+
ptrs.clear();
53+
sizes.clear();
54+
}
55+
// zeros out the amount of allocations
56+
void zero_allocated_memory() {
57+
for (unsigned i = 0; i < ptrs.size(); i++)
58+
zero(ptrs[i], sizes[i]);
59+
}
60+
61+
private:
62+
void sys_alloc(size_t cap);
63+
void zero_all() {}
64+
};
65+
66+
class InternalMemoryPool : public BaseMemoryPool {
67+
public:
68+
explicit InternalMemoryPool(const std::string & name, size_t cap, MemAllocator* a) : BaseMemoryPool(name, a) {
1469
sys_alloc(cap);
1570
zero_all();
1671
}
@@ -33,20 +88,18 @@ class InternalMemoryPool {
3388

3489
size_t used;
3590
private:
91+
size_t capacity;
92+
3693
void sys_alloc(size_t cap);
3794

3895
void zero_all() {
3996
a->zero(mem, capacity);
4097
}
41-
std::string name;
42-
size_t capacity;
43-
MemAllocator* a;
44-
void* mem;
4598
};
4699

47100
class AlignedMemoryPool {
48101
public:
49-
explicit AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit = 1<<24);
102+
explicit AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit = 1<<24, bool dynamic = false);
50103
~AlignedMemoryPool();
51104

52105
void* allocate(size_t n);
@@ -58,13 +111,18 @@ class AlignedMemoryPool {
58111
size_t used();
59112
void set_used(size_t s);
60113

114+
size_t round_up_align(size_t n) const { return a->round_up_align(n); }
115+
116+
bool is_dynamic() { return dynamic; }
117+
61118
private:
62119
std::string name;
63-
std::vector<InternalMemoryPool *> pools;
120+
std::vector<BaseMemoryPool *> pools;
64121
size_t cap;
65122
int current;
66123
MemAllocator* a;
67124
size_t expanding_unit;
125+
bool dynamic;
68126
};
69127

70128
} // namespace dynet

dynet/devices.cc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ Device_GPU::Device_GPU(int my_id, const DeviceMempoolSizes & mbs, int device_id)
122122
Device_GPU::~Device_GPU() {}
123123
#endif
124124

125-
Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared) :
125+
Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared, bool dynamic) :
126126
Device(my_id, DeviceType::CPU, &cpu_mem), shmem(mem) {
127127
if (shared) shmem = new SharedAllocator();
128128
kSCALAR_MINUSONE = (float*) mem->malloc(sizeof(float));
@@ -137,10 +137,11 @@ Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared) :
137137
edevice = new Eigen::DefaultDevice;
138138

139139
// this is the big memory allocation.
140-
pools[0] = new AlignedMemoryPool("CPU forward memory", (mbs.used[0] << 20), &cpu_mem);
141-
pools[1] = new AlignedMemoryPool("CPU backward memory", (mbs.used[1] << 20), &cpu_mem);
142-
pools[2] = new AlignedMemoryPool("CPU parameter memory", (mbs.used[2] << 20), shmem);
143-
pools[3] = new AlignedMemoryPool("CPU scratch memory", (mbs.used[3] << 20), &cpu_mem);
140+
const size_t initial = 1<<24;
141+
pools[0] = new AlignedMemoryPool("CPU forward memory", (mbs.used[0] << 20), &cpu_mem, initial, dynamic);
142+
pools[1] = new AlignedMemoryPool("CPU backward memory", (mbs.used[1] << 20), &cpu_mem, initial, dynamic);
143+
pools[2] = new AlignedMemoryPool("CPU parameter memory", (mbs.used[2] << 20), shmem, initial, dynamic);
144+
pools[3] = new AlignedMemoryPool("CPU scratch memory", (mbs.used[3] << 20), &cpu_mem, initial, dynamic);
144145
}
145146

146147
Device_CPU::~Device_CPU() {}

dynet/devices.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ class Device_GPU : public Device {
163163
class Device_CPU : public Device {
164164
public:
165165
typedef Eigen::DefaultDevice EigenDevice;
166-
explicit Device_CPU(int my_id, const DeviceMempoolSizes & mb, bool shared);
166+
explicit Device_CPU(int my_id, const DeviceMempoolSizes & mb, bool shared, bool dynamic);
167167
~Device_CPU();
168168
CPUAllocator cpu_mem;
169169
Eigen::DefaultDevice* edevice;

dynet/dynet.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ ComputationGraph::ComputationGraph() {
111111
} else {
112112
ee.reset(new SimpleExecutionEngine(*this));
113113
}
114-
if (n_hgs > 0) {
114+
if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {
115115
cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";
116116
throw std::runtime_error("Attempted to create >1 CG");
117117
}
@@ -128,7 +128,7 @@ ComputationGraph::ComputationGraph(bool batched) {
128128
} else {
129129
ee.reset(new SimpleExecutionEngine(*this));
130130
}
131-
if (n_hgs > 0) {
131+
if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {
132132
cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";
133133
throw std::runtime_error("Attempted to create >1 CG");
134134
}

dynet/exec.cc

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,15 @@ const Tensor& SimpleExecutionEngine::incremental_forward(VariableIndex i) {
9292
string current_node_name; // Optionally used for debugging (reused).
9393
vector<const Tensor*> xs(16); // Container for arguments to nodes (reused).
9494

95+
unsigned size = 0;
96+
void* begin;
97+
for (unsigned j = num_nodes_evaluated; j <= i; ++j) {
98+
const Node* node = cg.nodes[j];
99+
auto rounded_n = pool_fxs->round_up_align(node->dim.size() * sizeof(float));
100+
size += rounded_n;
101+
}
102+
begin = pool_fxs->allocate(size);
103+
95104
for (; num_nodes_evaluated <= i; ++num_nodes_evaluated) {
96105
const Node* node = cg.nodes[num_nodes_evaluated];
97106
if (autobatch_debug_flag) {
@@ -116,19 +125,19 @@ const Tensor& SimpleExecutionEngine::incremental_forward(VariableIndex i) {
116125
"SimpleExecutionEngine::incremental_forward");
117126
node_fx.device = node->device;
118127
node_fx.mem_pool = DeviceMempool::FXS;
119-
// Get the memory to store f(xs)
120-
auto& node_fx_pools = node_fx.device->pools;
121-
node_fx.v = static_cast<float*>(
122-
node_fx_pools[(int)DeviceMempool::FXS]->allocate(
123-
node->dim.size() * sizeof(float)));
128+
// Get the memory
129+
node_fx.v = static_cast<float*>(begin);
130+
auto rounded_n = pool_fxs->round_up_align(node->dim.size() * sizeof(float));
131+
begin += rounded_n;
132+
124133
if (node_fx.v == nullptr)
125134
DYNET_RUNTIME_ERR("Ran out of memory when executing node " <<
126135
num_nodes_evaluated);
127136
void* aux_mem = nullptr;
128137
// Is the node requesting extra memory?
129138
size_t aux_size = node->aux_storage_size();
130139
if (aux_size) {
131-
aux_mem = node_fx_pools[(int)DeviceMempool::FXS]->allocate(aux_size);
140+
aux_mem = pool_fxs->allocate(aux_size);
132141
if (aux_mem == nullptr)
133142
DYNET_RUNTIME_ERR("Ran out of auxiliary memory when executing node "
134143
<< num_nodes_evaluated);
@@ -161,30 +170,37 @@ void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
161170

162171
const unsigned num_nodes = from_where + 1;
163172
ndEdfs.resize(num_nodes);
164-
const vector<Device*> &devices = device_manager->get_devices();
165-
for(Device* device : devices)
166-
device->pools[(int)DeviceMempool::DEDFS]->free();
173+
pool_dEdfs->free();
167174

168175
// This loop allocates memory on the appropriate devices for the nodes whose
169176
// derivatives will be computed.
177+
// This assumes all of these use the same device!
178+
unsigned size = 0;
179+
void* begin;
180+
for (unsigned i = 0; i < num_nodes; ++i) {
181+
const Node* node = cg.nodes[i];
182+
auto rounded_n = pool_dEdfs->round_up_align(node->dim.size() * sizeof(float));
183+
size += rounded_n;
184+
}
185+
begin = pool_dEdfs->allocate(size);
186+
pool_dEdfs->zero_allocated_memory();
187+
170188
for (unsigned i = 0; i < num_nodes; ++i) {
171189
const auto dim = nfxs[i].d;
172190
auto& node_dEdfx = ndEdfs[i];
173191
node_dEdfx.d = dim;
174192
node_dEdfx.device = nfxs[i].device;
175193
node_dEdfx.mem_pool = DeviceMempool::DEDFS;
176-
node_dEdfx.v = static_cast<float*>(
177-
node_dEdfx.device->pools[(int)DeviceMempool::DEDFS]->allocate(
178-
dim.size() * sizeof(float)));
194+
node_dEdfx.v = static_cast<float*>(begin);
195+
auto rounded_n = pool_dEdfs->round_up_align(dim.size() * sizeof(float));
196+
begin += rounded_n;
197+
179198
if (node_dEdfx.v == nullptr) {
180199
DYNET_RUNTIME_ERR(
181200
"out of memory while attempting to allocate space for "
182201
"derivatives of node " << i);
183202
}
184203
}
185-
// Zero all derivative memory (which is contiguous on each device)
186-
for (Device* device : devices)
187-
device->pools[(int)DeviceMempool::DEDFS]->zero_allocated_memory();
188204

189205
// initialize dE/dE = 1
190206
ndEdfs.back().v = cg.nodes.back()->device->kSCALAR_ONE;

dynet/exec.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
#define DYNET_EXEC_H
33

44
#include "dynet/dynet.h"
5+
#include "dynet/aligned-mem-pool.h"
6+
#include <memory>
57

68
namespace dynet {
79

@@ -24,6 +26,10 @@ class ExecutionEngine {
2426
virtual const Tensor& get_gradient(VariableIndex i) = 0;
2527
virtual void backward(bool full = false) = 0;
2628
virtual void backward(VariableIndex i, bool full = false) = 0;
29+
AlignedMemoryPool* pool_fxs;
30+
AlignedMemoryPool* pool_dEdfs;
31+
MemAllocator* mem = nullptr;
32+
2733
protected:
2834
explicit ExecutionEngine(const ComputationGraph& cg);
2935
DeviceManager* const device_manager;
@@ -34,7 +40,25 @@ class ExecutionEngine {
3440
class SimpleExecutionEngine : public ExecutionEngine {
3541
public:
3642
explicit SimpleExecutionEngine(const ComputationGraph& cg) :
37-
ExecutionEngine(cg), num_nodes_evaluated(0) {}
43+
ExecutionEngine(cg), num_nodes_evaluated(0) {
44+
if (default_device->pools[0]->is_dynamic()) {
45+
mem = new CPUAllocator();
46+
pool_fxs = new AlignedMemoryPool("CPU forward memory", 1 << 24, mem, 1 << 24, true);
47+
pool_dEdfs = new AlignedMemoryPool("CPU backward memory", 1 << 24, mem, 1 << 24, true);
48+
} else {
49+
pool_fxs = default_device->pools[(int)DeviceMempool::FXS];
50+
pool_dEdfs = default_device->pools[(int)DeviceMempool::DEDFS];
51+
}
52+
}
53+
54+
~SimpleExecutionEngine() {
55+
if (default_device->pools[0]->is_dynamic()) {
56+
delete pool_fxs;
57+
delete pool_dEdfs;
58+
delete mem;
59+
}
60+
}
61+
3862
void invalidate() override;
3963
void invalidate(unsigned i) override;
4064
const Tensor& forward() override;

0 commit comments

Comments
 (0)