solver restructuring: now all prototxt are specified in the solver protocol buffer
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 22d27436e77422bf418bf126af12709ec6eb6aa1..0c344faeb1cd3216230f64461285e9447214f81f 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 
 #include "caffe/proto/caffe.pb.h"
-#include "caffe/layer_factory.hpp"
+#include "caffe/layer.hpp"
 #include "caffe/net.hpp"
 
 using std::pair;
@@ -23,17 +23,18 @@ Net<Dtype>::Net(const NetParameter& param,
   map<string, int> blob_name_to_idx;
   set<string> available_blobs;
   int num_layers = param.layers_size();
-  CHECK_EQ(bottom.size(), param.bottom_size())
+  CHECK_EQ(bottom.size(), param.input_size())
       << "Incorrect bottom blob size.";
   // set the input blobs
-  for (int i = 0; i < param.bottom_size(); ++i) {
-    const string& blob_name = param.bottom(i);
+  for (int i = 0; i < param.input_size(); ++i) {
+    const string& blob_name = param.input(i);
     CHECK_GT(bottom[i]->count(), 0);
     shared_ptr<Blob<Dtype> > blob_pointer(
         new Blob<Dtype>(bottom[i]->num(), bottom[i]->channels(),
             bottom[i]->height(), bottom[i]->width()));
     blobs_.push_back(blob_pointer);
     blob_names_.push_back(blob_name);
+    blob_need_backward_.push_back(false);
     net_input_blob_indices_.push_back(i);
     blob_name_to_idx[blob_name] = i;
     available_blobs.insert(blob_name);
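
Note on the hunk above: network inputs are now declared through the repeated input field of NetParameter (param.input / param.input_size) rather than the old bottom field, and every input blob gets a blob_need_backward_ entry from the start. Below is a minimal sketch of building such a NetParameter through the generated protobuf API; it uses only accessors that appear elsewhere in this diff, except set_type, which is an assumption about this era's LayerParameter definition.

    #include "caffe/proto/caffe.pb.h"

    using caffe::NetParameter;
    using caffe::LayerConnection;
    using caffe::LayerParameter;

    NetParameter MakeToyNetParam() {
      NetParameter param;
      param.set_name("toy_net");
      param.add_input("data");              // formerly param.add_bottom("data")
      LayerConnection* conn = param.add_layers();
      conn->add_bottom("data");
      conn->add_top("ip");
      LayerParameter* layer = conn->mutable_layer();
      layer->set_name("ip");
      layer->set_type("innerproduct");      // assumed field name, not shown in this diff
      return param;
    }

The Net constructor then receives one pre-allocated Blob per input entry as its bottom vector, as the hunk above requires.
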
@@ -49,89 +50,147 @@ Net<Dtype>::Net(const NetParameter& param,
     layers_.push_back(shared_ptr<Layer<Dtype> >(GetLayer<Dtype>(layer_param)));
     layer_names_.push_back(layer_param.name());
     LOG(INFO) << "Creating Layer " << layer_param.name();
+    bool need_backward = false;
     // Figure out this layer's input and output
     for (int j = 0; j < layer_connection.bottom_size(); ++j) {
       const string& blob_name = layer_connection.bottom(j);
+      const int blob_id = blob_name_to_idx[blob_name];
       if (available_blobs.find(blob_name) == available_blobs.end()) {
         LOG(FATAL) << "Unknown blob input " << blob_name <<
             " to layer" << j;
       }
       LOG(INFO) << layer_param.name() << " <- " << blob_name;
       bottom_vecs_[i].push_back(
-          blobs_[blob_name_to_idx[blob_name]].get());
-      bottom_id_vecs_[i].push_back(blob_name_to_idx[blob_name]);
+          blobs_[blob_id].get());
+      bottom_id_vecs_[i].push_back(blob_id);
+      // If a blob needs backward, this layer should provide it.
+      need_backward |= blob_need_backward_[blob_id];
       available_blobs.erase(blob_name);
     }
     for (int j = 0; j < layer_connection.top_size(); ++j) {
       const string& blob_name = layer_connection.top(j);
-      if (blob_name_to_idx.find(blob_name) != blob_name_to_idx.end()) {
+      // Check if we are doing in-place computation
+      if (layer_connection.bottom_size() > j &&
+          blob_name == layer_connection.bottom(j)) {
+        // In-place computation
+        LOG(INFO) << layer_param.name() << " -> " << blob_name << " (in-place)";
+        available_blobs.insert(blob_name);
+        top_vecs_[i].push_back(
+            blobs_[blob_name_to_idx[blob_name]].get());
+        top_id_vecs_[i].push_back(blob_name_to_idx[blob_name]);
+      } else if (blob_name_to_idx.find(blob_name) != blob_name_to_idx.end()) {
+        // If we are not doing in-place computation but the blob name is
+        // already in use, raise an error.
         LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
+      } else {
+        // Normal output.
+        LOG(INFO) << layer_param.name() << " -> " << blob_name;
+        shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
+        blobs_.push_back(blob_pointer);
+        blob_names_.push_back(blob_name);
+        blob_need_backward_.push_back(false);
+        blob_name_to_idx[blob_name] = blob_names_.size() - 1;
+        available_blobs.insert(blob_name);
+        top_vecs_[i].push_back(blobs_[blob_names_.size() - 1].get());
+        top_id_vecs_[i].push_back(blob_names_.size() - 1);
       }
-      LOG(INFO) << layer_param.name() << " -> " << blob_name;
-      shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
-      blobs_.push_back(blob_pointer);
-      blob_names_.push_back(blob_name);
-      blob_name_to_idx[blob_name] = blob_names_.size() - 1;
-      available_blobs.insert(blob_name);
-      top_vecs_[i].push_back(blobs_[blob_names_.size() - 1].get());
-      top_id_vecs_[i].push_back(blob_names_.size() - 1);
     }
-  }
-  LOG(INFO) << "Checking top blobs.";
-  // In the end, check if all remaining available blobs are top blobs.
-  for (int i = 0; i < param.top_size(); ++i) {
-    const string& blob_name = param.top(i);
-    if (blob_name_to_idx.find(blob_name) == blob_name_to_idx.end()) {
-      LOG(FATAL) << "Unknown blob output " << blob_name;
+    // After this layer is connected, set it up.
+    // LOG(INFO) << "Setting up " << layer_names_[i];
+    layers_[i]->SetUp(bottom_vecs_[i], &top_vecs_[i]);
+    for (int topid = 0; topid < top_vecs_[i].size(); ++topid) {
+      LOG(INFO) << "Top shape: " << top_vecs_[i][topid]->channels() << " "
+          << top_vecs_[i][topid]->height() << " "
+          << top_vecs_[i][topid]->width();
     }
-    net_output_blob_indices_.push_back(blob_name_to_idx[blob_name]);
-    available_blobs.erase(blob_name);
-  }
-  if (!available_blobs.empty()) {
-    LOG(WARNING) << "There are some internal blobs not used:";
-    for (set<string>::iterator it = available_blobs.begin();
-        it != available_blobs.end(); ++it) {
-      LOG(WARNING) << "    " << *it;
+    // Check if this layer needs backward operation itself
+    for (int j = 0; j < layers_[i]->layer_param().blobs_lr_size(); ++j) {
+      need_backward |= (layers_[i]->layer_param().blobs_lr(j) > 0);
     }
+    // Finally, set the backward flag
+    layer_need_backward_.push_back(need_backward);
+    if (need_backward) {
+      LOG(INFO) << layer_names_[i] << " needs backward computation.";
+      for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
+        blob_need_backward_[top_id_vecs_[i][j]] = true;
+      }
+    } else {
+      LOG(INFO) << layer_names_[i] << " does not need backward computation.";
+    }
+  }
+  // In the end, all remaining blobs are considered output blobs.
+  for (set<string>::iterator it = available_blobs.begin();
+      it != available_blobs.end(); ++it) {
+    LOG(INFO) << "This network produces output " << *it;
+    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
+    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
   }
+  GetLearningRateAndWeightDecay();
+  LOG(INFO) << "Network initialization done.";
+}
+
 
-  LOG(INFO) << "Setting up the layers.";
+template <typename Dtype>
+void Net<Dtype>::GetLearningRateAndWeightDecay() {
+  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
   for (int i = 0; i < layers_.size(); ++i) {
-    LOG(INFO) << "Setting up " << layer_names_[i];
-    layers_[i]->SetUp(bottom_vecs_[i], &top_vecs_[i]);
-    vector<shared_ptr<Blob<Dtype> > >& layer_params = layers_[i]->params();
-    for (int j = 0; j < layer_params.size(); ++j) {
-      params_.push_back(layer_params[j]);
+    vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
+    for (int j = 0; j < layer_blobs.size(); ++j) {
+      params_.push_back(layer_blobs[j]);
+    }
+    // push the learning rate multipliers
+    if (layers_[i]->layer_param().blobs_lr_size()) {
+      CHECK_EQ(layers_[i]->layer_param().blobs_lr_size(), layer_blobs.size());
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        float local_lr = layers_[i]->layer_param().blobs_lr(j);
+        CHECK_GE(local_lr, 0.);
+        params_lr_.push_back(local_lr);
+      }
+    } else {
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        params_lr_.push_back(1.);
+      }
+    }
+    // push the weight decay multipliers
+    if (layers_[i]->layer_param().weight_decay_size()) {
+      CHECK_EQ(layers_[i]->layer_param().weight_decay_size(),
+          layer_blobs.size());
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        float local_decay = layers_[i]->layer_param().weight_decay(j);
+        CHECK_GE(local_decay, 0.);
+        params_weight_decay_.push_back(local_decay);
+      }
+    } else {
+      for (int j = 0; j < layer_blobs.size(); ++j) {
+        params_weight_decay_.push_back(1.);
+      }
     }
   }
-
-  LOG(INFO) << "Network initialization done.";
 }
 
 template <typename Dtype>
-void Net<Dtype>::Forward(const vector<Blob<Dtype>*> & bottom,
-    vector<Blob<Dtype>*>* top) {
+const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
+    const vector<Blob<Dtype>*> & bottom) {
   // Copy bottom to internal bottom
   for (int i = 0; i < bottom.size(); ++i) {
     blobs_[net_input_blob_indices_[i]]->CopyFrom(*bottom[i]);
   }
   for (int i = 0; i < layers_.size(); ++i) {
+    // LOG(ERROR) << "Forwarding " << layer_names_[i];
     layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]);
   }
-  // Copy internal top to top
-  for (int i = 0; i < (*top).size(); ++i) {
-    (*top)[i]->CopyFrom(*blobs_[net_output_blob_indices_[i]]);
-  }
+  return net_output_blobs_;
 }
 
 template <typename Dtype>
 Dtype Net<Dtype>::Backward() {
   Dtype loss = 0;
-  // TODO(Yangqing): figure out those layers that do not need backward.
   for (int i = layers_.size() - 1; i >= 0; --i) {
-    Dtype layer_loss = layers_[i]->Backward(
-        top_vecs_[i], true, &bottom_vecs_[i]);
-    loss += layer_loss;
+    if (layer_need_backward_[i]) {
+      Dtype layer_loss = layers_[i]->Backward(
+          top_vecs_[i], true, &bottom_vecs_[i]);
+      loss += layer_loss;
+    }
   }
   return loss;
 }
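
With the hunk above, Forward returns a const reference to the net's own output blobs (every blob still unconsumed at the end of construction) instead of copying into caller-supplied tops, and Backward skips layers whose layer_need_backward_ flag is false. A rough caller-side sketch under those assumptions; net_param and the input blobs are presumed to be prepared elsewhere, and PrepareInputBlobs is a hypothetical helper.

    // One pre-filled Blob<float>* per "input" entry of net_param (see the
    // constructor hunk above); allocation and data loading are omitted.
    vector<Blob<float>*> bottom_vec = PrepareInputBlobs(net_param);  // hypothetical helper
    Net<float> caffe_net(net_param, bottom_vec);

    // Forward now hands back the net's own output blobs by reference.
    const vector<Blob<float>*>& output = caffe_net.Forward(bottom_vec);
    for (int i = 0; i < output.size(); ++i) {
      LOG(INFO) << "output blob " << i << " has count " << output[i]->count();
    }

    // Layers whose layer_need_backward_ entry is false are skipped inside Backward.
    float loss = caffe_net.Backward();
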
@@ -148,15 +207,19 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
       ++target_layer_id;
     }
     if (target_layer_id == layer_names_.size()) {
-      LOG(INFO) << "Ignoring source layer " << source_layer_name;
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
       continue;
     }
-    LOG(INFO) << "Loading source layer " << source_layer_name;
+    DLOG(INFO) << "Loading source layer " << source_layer_name;
     vector<shared_ptr<Blob<Dtype> > >& target_blobs =
-        layers_[target_layer_id]->params();
+        layers_[target_layer_id]->blobs();
     CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
         << "Incompatible number of blobs for layer " << source_layer_name;
     for (int j = 0; j < target_blobs.size(); ++j) {
+      CHECK_EQ(target_blobs[j]->num(), source_layer.blobs(j).num());
+      CHECK_EQ(target_blobs[j]->channels(), source_layer.blobs(j).channels());
+      CHECK_EQ(target_blobs[j]->height(), source_layer.blobs(j).height());
+      CHECK_EQ(target_blobs[j]->width(), source_layer.blobs(j).width());
       target_blobs[j]->FromProto(source_layer.blobs(j));
     }
   }
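
CopyTrainedLayersFrom now verifies that each matched blob agrees in num, channels, height, and width before FromProto, so a shape mismatch fails loudly instead of being copied silently. A hedged usage sketch follows; ReadProtoFromBinaryFile is assumed to be this tree's io helper for parsing a serialized NetParameter, so substitute whatever loader the build actually provides.

    #include "caffe/util/io.hpp"  // assumed location of the proto reading helpers

    NetParameter trained_param;
    // Assumption: a helper with roughly this signature parses a binary snapshot.
    ReadProtoFromBinaryFile("pretrained_net.binaryproto", &trained_param);
    // Layers are matched by name; unmatched source layers are skipped (DLOG above),
    // and every matched blob must have identical num/channels/height/width.
    caffe_net.CopyTrainedLayersFrom(trained_param);
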
@@ -168,21 +231,19 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) {
   param->set_name(name_);
   // Add bottom and top
   for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
-    param->add_bottom(blob_names_[net_input_blob_indices_[i]]);
-  }
-  for (int i = 0; i < net_output_blob_indices_.size(); ++i) {
-    param->add_top(blob_names_[net_output_blob_indices_[i]]);
+    param->add_input(blob_names_[net_input_blob_indices_[i]]);
   }
+  DLOG(INFO) << "Serializing " << layers_.size() << " layers";
   for (int i = 0; i < layers_.size(); ++i) {
     LayerConnection* layer_connection = param->add_layers();
-    for (int j = 0; j < bottom_id_vecs_[i].size(); ++i) {
+    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
       layer_connection->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
     }
-    for (int j = 0; j < top_id_vecs_[i].size(); ++i) {
+    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
       layer_connection->add_top(blob_names_[top_id_vecs_[i][j]]);
     }
     LayerParameter* layer_parameter = layer_connection->mutable_layer();
-    layers_[i]->ToProto(layer_parameter);
+    layers_[i]->ToProto(layer_parameter, write_diff);
   }
 }
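
Finally, ToProto now records the input blob names via add_input, fixes the ++i/++j loop increments, and forwards the write_diff flag to each layer's ToProto. Serializing a snapshot could then look roughly like this; WriteProtoToBinaryFile is an assumed counterpart of the reader mentioned above.

    NetParameter snapshot;
    // write_diff = false: store only the learned weights, not the gradient diffs.
    caffe_net.ToProto(&snapshot, false);
    // Assumption: a binary writer analogous to ReadProtoFromBinaryFile.
    WriteProtoToBinaryFile(snapshot, "net_snapshot.binaryproto");
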