clab
diff --git a/‎dynet/aligned-mem-pool.cc‎
Lines changed: 36 additions & 4 deletions b/‎dynet/aligned-mem-pool.cc‎
Lines changed: 36 additions & 4 deletions
diff --git a/‎dynet/aligned-mem-pool.h‎
Lines changed: 66 additions & 8 deletions b/‎dynet/aligned-mem-pool.h‎
Lines changed: 66 additions & 8 deletions
diff --git a/‎dynet/devices.cc‎
Lines changed: 6 additions & 5 deletions b/‎dynet/devices.cc‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎dynet/devices.h‎
Lines changed: 1 addition & 1 deletion b/‎dynet/devices.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dynet/dynet.cc‎
Lines changed: 2 additions & 2 deletions b/‎dynet/dynet.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dynet/exec.cc‎
Lines changed: 31 additions & 15 deletions b/‎dynet/exec.cc‎
Lines changed: 31 additions & 15 deletions
diff --git a/‎dynet/exec.h‎
Lines changed: 25 additions & 1 deletion b/‎dynet/exec.h‎
Lines changed: 25 additions & 1 deletion
@@ -4,6 +4,25 @@
 
 using namespace dynet;
 
+
+// Zeroes the block at `p`. The byte count handed to the allocator is `n`
+// rounded up by the allocator's own round_up_align().
+void DynamicCPUMemoryPool::zero(void* p, size_t n) {
+  a->zero(p, a->round_up_align(n));
+}
+
+// Requests `n` bytes, rounded up by the allocator's round_up_align(), from
+// the allocator. A non-null result is recorded in `ptrs` together with its
+// rounded size in `sizes`; a null result is returned to the caller unrecorded.
+void* DynamicCPUMemoryPool::allocate(size_t n) {
+  const auto padded_bytes = a->round_up_align(n);
+  void* const block = a->malloc(padded_bytes);
+  if (block == nullptr) {
+    return block;
+  }
+  ptrs.push_back(block);
+  sizes.push_back(padded_bytes);
+  return block;
+}
+
+// No-op: this pool obtains memory from its allocator inside allocate(), so
+// nothing is reserved here and the requested capacity is ignored.
+void DynamicCPUMemoryPool::sys_alloc(size_t /*cap*/) {}
+
+
 void* InternalMemoryPool::allocate(size_t n) {
   auto rounded_n = a->round_up_align(n);
   if (rounded_n + used > capacity) {
@@ -22,9 +41,13 @@ void InternalMemoryPool::sys_alloc(size_t cap) {
   used = 0;
 }
 
-AlignedMemoryPool::AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit) : name(name), cap(initial_cap), current(0), a(a), expanding_unit(expanding_unit) {
+// Records the pool settings and creates the first underlying pool: a
+// DynamicCPUMemoryPool when `dynamic` is true, otherwise an
+// InternalMemoryPool backed by the allocator `a`. The initial capacity is
+// checked with DYNET_ARG_CHECK before any pool is created.
+AlignedMemoryPool::AlignedMemoryPool(const std::string &name,
+                                     size_t initial_cap,
+                                     MemAllocator *a,
+                                     size_t expanding_unit,
+                                     bool dynamic)
+    : name(name),
+      cap(initial_cap),
+      current(0),
+      a(a),
+      expanding_unit(expanding_unit),
+      dynamic(dynamic) {
   DYNET_ARG_CHECK(cap > 0, "Attempt to allocate memory of size 0 in AlignedMemoryPool");
-  pools.push_back(new InternalMemoryPool(name, cap, a));
+  BaseMemoryPool* first_pool = nullptr;
+  if (!dynamic) {
+    first_pool = new InternalMemoryPool(name, cap, a);
+  } else {
+    first_pool = new DynamicCPUMemoryPool(name, cap);
+  }
+  pools.push_back(first_pool);
 }
 AlignedMemoryPool::~AlignedMemoryPool() {
   for ( auto p : pools) { delete p; }
@@ -35,7 +58,11 @@ void* AlignedMemoryPool::allocate(size_t n) {
   if (res == 0) {
     // round up to the nearest multiple of expanding_unit
     size_t new_pool_size  = (n + expanding_unit-1) / expanding_unit * expanding_unit;
-    pools.push_back(new InternalMemoryPool(name, new_pool_size, a));
+    if (dynamic) {
+      pools.push_back(new DynamicCPUMemoryPool(name, new_pool_size));
+    } else {
+      pools.push_back(new InternalMemoryPool(name, new_pool_size, a));
+    }
     cap += new_pool_size;
     current++;
     res = pools[current]->allocate(n);
@@ -47,7 +74,12 @@ void AlignedMemoryPool::free() {
   if (current > 0) {
     for (auto p : pools) { delete p; }
     pools.clear();
-    pools.push_back(new InternalMemoryPool(name, cap, a));
+    if (dynamic) {
+      pools.push_back(new DynamicCPUMemoryPool(name, cap));
+    } else {
+      pools.push_back(new InternalMemoryPool(name, cap, a));
+    }
+    cap = cap * (current + 1);
     current = 0;
   }
   pools[0]->free();
 
@@ -8,9 +8,64 @@
 
 namespace dynet {
 
-class InternalMemoryPool {
+// Abstract interface shared by the pools that AlignedMemoryPool stores in its
+// `pools` vector. Concrete pools must implement allocate(), free() and
+// zero_allocated_memory(); sys_alloc() and zero_all() default to no-ops.
+class BaseMemoryPool {
  public:
-  explicit InternalMemoryPool(const std::string & name, size_t cap, MemAllocator* a) : name(name), a(a) {
+  // Stores the allocator and the pool name. `used` and `mem` take their
+  // in-class defaults (0 / nullptr) so that a subclass which never assigns
+  // them still exposes defined values instead of indeterminate ones.
+  // The initialiser list follows member declaration order.
+  BaseMemoryPool(const std::string & name, MemAllocator* a) : a(a), name(name) {}
+  virtual ~BaseMemoryPool() {}
+  // Hands out memory for `n` bytes. AlignedMemoryPool::allocate compares the
+  // result against 0, so a null pointer signals that the request failed.
+  virtual void* allocate(size_t n) = 0;
+
+  // Releases / resets whatever this pool has handed out.
+  virtual void free() = 0;
+  // zeros out the amount of allocations
+  virtual void zero_allocated_memory() = 0;
+
+  // Bookkeeping counter maintained by the concrete pool; starts at 0.
+  size_t used = 0;
+
+ protected:
+  virtual void sys_alloc(size_t cap) {}
+  virtual void zero_all() {}
+
+  MemAllocator* a;
+  std::string name;
+  // Backing buffer for pools that reserve one up front; nullptr otherwise.
+  void* mem = nullptr;
+};
+
+// Memory pool that forwards each allocate() call to a CPUAllocator it creates
+// itself, and remembers every returned block so the blocks can later be
+// zeroed or released individually.
+class DynamicCPUMemoryPool : public BaseMemoryPool {
+ private:
+  std::vector<void*> ptrs;    // blocks returned by allocate() and not yet freed
+  std::vector<size_t> sizes;  // sizes[i] is the rounded byte size of ptrs[i]
+
+ public:
+  // Creates the pool together with a private CPUAllocator.
+  // `cap` is accepted but not used by this constructor.
+  explicit DynamicCPUMemoryPool(const std::string & name, size_t cap)
+    : BaseMemoryPool(name, new CPUAllocator()) {}
+
+  // The pool owns its allocator and every block in `ptrs`; a copy would
+  // release both a second time, so copying is disabled.
+  DynamicCPUMemoryPool(const DynamicCPUMemoryPool&) = delete;
+  DynamicCPUMemoryPool& operator=(const DynamicCPUMemoryPool&) = delete;
+
+  // Releases all outstanding blocks, then the allocator created in the
+  // constructor.
+  // NOTE(review): `delete a` goes through MemAllocator*; this assumes
+  // MemAllocator has a virtual destructor -- confirm.
+  ~DynamicCPUMemoryPool() override {
+      DynamicCPUMemoryPool::free();
+      delete a;
+  }
+
+  void* allocate(size_t n) override;
+  void zero(void* p, size_t n);
+
+  // Returns every recorded block to the allocator and forgets them.
+  void free() override {
+    for (auto p : ptrs)
+      a->free(p);
+    ptrs.clear();
+    sizes.clear();
+  }
+  // zeros out the amount of allocations
+  void zero_allocated_memory() override {
+    for (size_t i = 0; i < ptrs.size(); ++i)
+      zero(ptrs[i], sizes[i]);
+  }
+
+ private:
+  void sys_alloc(size_t cap) override;
+  void zero_all() override {}
+};
+
+class InternalMemoryPool : public BaseMemoryPool {
+ public:
+  explicit InternalMemoryPool(const std::string & name, size_t cap, MemAllocator* a) : BaseMemoryPool(name, a) {
     sys_alloc(cap);
     zero_all();
   }
@@ -33,20 +88,18 @@ class InternalMemoryPool {
 
   size_t used;
  private:
+  size_t capacity;
+
   void sys_alloc(size_t cap);
 
   void zero_all() {
     a->zero(mem, capacity);
   }
-  std::string name;
-  size_t capacity;
-  MemAllocator* a;
-  void* mem;
 };
 
 class AlignedMemoryPool {
   public:
-    explicit AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit = 1<<24);
+    explicit AlignedMemoryPool(const std::string &name, size_t initial_cap, MemAllocator *a, size_t expanding_unit = 1<<24, bool dynamic = false);
     ~AlignedMemoryPool();
 
     void* allocate(size_t n);
@@ -58,13 +111,18 @@ class AlignedMemoryPool {
     size_t used();
     void set_used(size_t s);
 
+    size_t round_up_align(size_t n) const { return a->round_up_align(n); }
+
+    bool is_dynamic() { return dynamic; }
+
   private:
     std::string name;
-    std::vector<InternalMemoryPool *> pools;
+    std::vector<BaseMemoryPool *> pools;
     size_t cap;
     int current;
     MemAllocator* a;
     size_t expanding_unit;
+    bool dynamic;
 };
 
 } // namespace dynet
 
@@ -122,7 +122,7 @@ Device_GPU::Device_GPU(int my_id, const DeviceMempoolSizes & mbs, int device_id)
 Device_GPU::~Device_GPU() {}
 #endif
 
-Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared) :
+Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared, bool dynamic) :
   Device(my_id, DeviceType::CPU, &cpu_mem), shmem(mem) {
   if (shared) shmem = new SharedAllocator();
   kSCALAR_MINUSONE = (float*) mem->malloc(sizeof(float));
@@ -137,10 +137,11 @@ Device_CPU::Device_CPU(int my_id, const DeviceMempoolSizes & mbs, bool shared) :
   edevice = new Eigen::DefaultDevice;
 
   // this is the big memory allocation.
-  pools[0] = new AlignedMemoryPool("CPU forward memory", (mbs.used[0] << 20), &cpu_mem);
-  pools[1] = new AlignedMemoryPool("CPU backward memory", (mbs.used[1] << 20), &cpu_mem);
-  pools[2] = new AlignedMemoryPool("CPU parameter memory", (mbs.used[2] << 20), shmem);
-  pools[3] = new AlignedMemoryPool("CPU scratch memory", (mbs.used[3] << 20), &cpu_mem);
+  const size_t initial = 1<<24;
+  pools[0] = new AlignedMemoryPool("CPU forward memory",   (mbs.used[0] << 20), &cpu_mem, initial, dynamic);
+  pools[1] = new AlignedMemoryPool("CPU backward memory",  (mbs.used[1] << 20), &cpu_mem, initial, dynamic);
+  pools[2] = new AlignedMemoryPool("CPU parameter memory", (mbs.used[2] << 20), shmem,    initial, dynamic);
+  pools[3] = new AlignedMemoryPool("CPU scratch memory",   (mbs.used[3] << 20), &cpu_mem, initial, dynamic);
 }
 
 Device_CPU::~Device_CPU() {}
 
@@ -163,7 +163,7 @@ class Device_GPU : public Device {
 class Device_CPU : public Device {
  public:
   typedef Eigen::DefaultDevice EigenDevice;
-  explicit Device_CPU(int my_id, const DeviceMempoolSizes & mb, bool shared);
+  explicit Device_CPU(int my_id, const DeviceMempoolSizes & mb, bool shared, bool dynamic);
   ~Device_CPU();
   CPUAllocator cpu_mem;
   Eigen::DefaultDevice* edevice;
 
@@ -111,7 +111,7 @@ ComputationGraph::ComputationGraph() {
   } else {
     ee.reset(new SimpleExecutionEngine(*this));
   }
-  if (n_hgs > 0) {
+  if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {
     cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";
     throw std::runtime_error("Attempted to create >1 CG");
   }
@@ -128,7 +128,7 @@ ComputationGraph::ComputationGraph(bool batched) {
   } else {
     ee.reset(new SimpleExecutionEngine(*this));
   }
-  if (n_hgs > 0) {
+  if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {
     cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";
     throw std::runtime_error("Attempted to create >1 CG");
   }
 
@@ -92,6 +92,15 @@ const Tensor& SimpleExecutionEngine::incremental_forward(VariableIndex i) {
     string current_node_name;  // Optionally used for debugging (reused).
     vector<const Tensor*> xs(16);  // Container for arguments to nodes (reused).
 
+    unsigned size = 0;
+    void* begin;
+    for (unsigned j = num_nodes_evaluated; j <= i; ++j) {
+      const Node* node = cg.nodes[j];
+      auto rounded_n = pool_fxs->round_up_align(node->dim.size() * sizeof(float));
+      size += rounded_n;
+    }
+    begin = pool_fxs->allocate(size);
+
     for (; num_nodes_evaluated <= i; ++num_nodes_evaluated) {
       const Node* node = cg.nodes[num_nodes_evaluated];
       if (autobatch_debug_flag) {
@@ -116,19 +125,19 @@ const Tensor& SimpleExecutionEngine::incremental_forward(VariableIndex i) {
           "SimpleExecutionEngine::incremental_forward");
       node_fx.device = node->device;
       node_fx.mem_pool = DeviceMempool::FXS;
-      // Get the memory to store f(xs)
-      auto& node_fx_pools = node_fx.device->pools;
-      node_fx.v = static_cast<float*>(
-          node_fx_pools[(int)DeviceMempool::FXS]->allocate(
-              node->dim.size() * sizeof(float)));
+      // Get the memory
+      node_fx.v = static_cast<float*>(begin);
+      auto rounded_n = pool_fxs->round_up_align(node->dim.size() * sizeof(float));
+      begin += rounded_n; 
+
       if (node_fx.v == nullptr)
         DYNET_RUNTIME_ERR("Ran out of memory when executing node " <<
                           num_nodes_evaluated);
       void* aux_mem = nullptr;
       // Is the node requesting extra memory?
       size_t aux_size = node->aux_storage_size();
       if (aux_size) {
-        aux_mem = node_fx_pools[(int)DeviceMempool::FXS]->allocate(aux_size);
+        aux_mem = pool_fxs->allocate(aux_size);
         if (aux_mem == nullptr)
           DYNET_RUNTIME_ERR("Ran out of auxiliary memory when executing node "
                             << num_nodes_evaluated);
@@ -161,30 +170,37 @@ void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
 
   const unsigned num_nodes = from_where + 1;
   ndEdfs.resize(num_nodes);
-  const vector<Device*> &devices = device_manager->get_devices();
-  for(Device* device : devices)
-    device->pools[(int)DeviceMempool::DEDFS]->free();
+  pool_dEdfs->free();
 
   // This loop allocates memory on the appropriate devices for the nodes whose
   // derivatives will be computed.
+  // This assumes all of these use the same device!
+  unsigned size = 0;
+  void* begin;
+  for (unsigned i = 0; i < num_nodes; ++i) {
+    const Node* node = cg.nodes[i];
+    auto rounded_n = pool_dEdfs->round_up_align(node->dim.size() * sizeof(float));
+    size += rounded_n;
+  }
+  begin = pool_dEdfs->allocate(size);
+  pool_dEdfs->zero_allocated_memory();
+
   for (unsigned i = 0; i < num_nodes; ++i) {
     const auto dim = nfxs[i].d;
     auto& node_dEdfx = ndEdfs[i];
     node_dEdfx.d = dim;
     node_dEdfx.device = nfxs[i].device;
     node_dEdfx.mem_pool = DeviceMempool::DEDFS;
-    node_dEdfx.v = static_cast<float*>(
-        node_dEdfx.device->pools[(int)DeviceMempool::DEDFS]->allocate(
-            dim.size() * sizeof(float)));
+    node_dEdfx.v = static_cast<float*>(begin);
+    auto rounded_n = pool_dEdfs->round_up_align(dim.size() * sizeof(float));
+    begin += rounded_n;
+
     if (node_dEdfx.v == nullptr) {
       DYNET_RUNTIME_ERR(
           "out of memory while attempting to allocate space for "
           "derivatives of node " << i);
     }
   }
-  // Zero all derivative memory (which is contiguous on each device)
-  for (Device* device : devices)
-    device->pools[(int)DeviceMempool::DEDFS]->zero_allocated_memory();
 
   // initialize dE/dE = 1
   ndEdfs.back().v = cg.nodes.back()->device->kSCALAR_ONE;
 
@@ -2,6 +2,8 @@
 #define DYNET_EXEC_H
 
 #include "dynet/dynet.h"
+#include "dynet/aligned-mem-pool.h"
+#include <memory>
 
 namespace dynet {
 
@@ -24,6 +26,10 @@ class ExecutionEngine {
   virtual const Tensor& get_gradient(VariableIndex i) = 0;
   virtual void backward(bool full = false) = 0;
   virtual void backward(VariableIndex i, bool full = false) = 0;
+  AlignedMemoryPool* pool_fxs;
+  AlignedMemoryPool* pool_dEdfs;
+  MemAllocator* mem = nullptr;
+
  protected:
   explicit ExecutionEngine(const ComputationGraph& cg);
   DeviceManager* const device_manager;
@@ -34,7 +40,25 @@ class ExecutionEngine {
 class SimpleExecutionEngine : public ExecutionEngine {
  public:
   explicit SimpleExecutionEngine(const ComputationGraph& cg) :
-    ExecutionEngine(cg), num_nodes_evaluated(0) {}
+    ExecutionEngine(cg), num_nodes_evaluated(0) {
+    // Default device is not dynamic: borrow its forward and backward pools.
+    if (!default_device->pools[0]->is_dynamic()) {
+      pool_fxs   = default_device->pools[(int)DeviceMempool::FXS];
+      pool_dEdfs = default_device->pools[(int)DeviceMempool::DEDFS];
+      return;
+    }
+    // Dynamic default device: give this engine its own allocator and its own
+    // pair of dynamic pools (released again in the destructor).
+    const size_t pool_bytes = 1 << 24;
+    mem = new CPUAllocator();
+    pool_fxs   = new AlignedMemoryPool("CPU forward memory",  pool_bytes, mem, pool_bytes, true);
+    pool_dEdfs = new AlignedMemoryPool("CPU backward memory", pool_bytes, mem, pool_bytes, true);
+  }
+
+  // Releases the pools and allocator that the constructor creates when the
+  // default device is dynamic. The decision is keyed on `mem`, which the
+  // constructor only sets in that branch, rather than on re-querying
+  // default_device: if the default device changed after construction, the
+  // re-query could leak the private pools or delete pools that are merely
+  // borrowed from a device.
+  // NOTE(review): `delete mem` goes through MemAllocator*; this assumes
+  // MemAllocator has a virtual destructor -- confirm.
+  ~SimpleExecutionEngine() {
+    if (mem != nullptr) {
+      delete pool_fxs;
+      delete pool_dEdfs;
+      delete mem;
+      pool_fxs = nullptr;
+      pool_dEdfs = nullptr;
+      mem = nullptr;
+    }
+  }
+  
   void invalidate() override;
   void invalidate(unsigned i) override;
   const Tensor& forward() override;
Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,7 @@ ComputationGraph::ComputationGraph() {`
`111`	`111`	`} else {`
`112`	`112`	`ee.reset(new SimpleExecutionEngine(*this));`
`113`	`113`	`}`
`114`		`- if (n_hgs > 0) {`
	`114`	`+ if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {`
`115`	`115`	`cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";`
`116`	`116`	`throw std::runtime_error("Attempted to create >1 CG");`
`117`	`117`	`}`
`@@ -128,7 +128,7 @@ ComputationGraph::ComputationGraph(bool batched) {`
`128`	`128`	`} else {`
`129`	`129`	`ee.reset(new SimpleExecutionEngine(*this));`
`130`	`130`	`}`
`131`		`- if (n_hgs > 0) {`
	`131`	`+ if (!default_device->pools[0]->is_dynamic() && n_hgs > 0) {`
`132`	`132`	`cerr << "Memory allocator assumes only a single ComputationGraph at a time.\n";`
`133`	`133`	`throw std::runtime_error("Attempted to create >1 CG");`
`134`	`134`	`}`