Keypoint inference support for CPU-only flow

[jacinto-ai/caffe-jacinto.git] / src / caffe / layers / detection_output_layer.cpp
diff --git a/src/caffe/layers/detection_output_layer.cpp b/src/caffe/layers/detection_output_layer.cpp

index 0e9b619346a17b4ee0c3bdabac1cccf1843790b3..8d7ab2f382f952bdffb9fd205ccaa10a22ad691e 100644 (file)
--- a/src/caffe/layers/detection_output_layer.cpp
+++ b/src/caffe/layers/detection_output_layer.cpp
@@ -121,11 +121,16 @@ void DetectionOutputLayer<Ftype, Btype>::LayerSetUp(const vector<Blob*>& bottom,
      bbox_permute_.ReshapeLike(*(bottom[0]));
    }
    conf_permute_.ReshapeLike(*(bottom[1]));
+  num_keypoint_ = detection_output_param.num_keypoint();
+  output_size_ = num_keypoint_*2 + 7;
+  code_size_ = num_keypoint_*2 + 4;
  }
  
  template <typename Ftype, typename Btype>
  void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
        const vector<Blob*>& top) {
+  const int code_size_ = num_keypoint_*2 + 4;
+
    if (need_save_) {
      CHECK_LE(name_count_, names_.size());
      if (name_count_ % num_test_image_ == 0) {
@@ -159,8 +164,9 @@ void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
        conf_permute_.count(1) != bottom[1]->count(1)) {
      conf_permute_.ReshapeLike(*(bottom[1]));
    }
-  num_priors_ = bottom[2]->height() / 4;
-  CHECK_EQ(num_priors_ * num_loc_classes_ * 4, bottom[0]->channels())
+  num_priors_ = bottom[2]->height() / code_size_;
+
+  CHECK_EQ(num_priors_ * num_loc_classes_ * code_size_, bottom[0]->channels())
        << "Number of priors must match number of location predictions.";
    CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
        << "Number of priors must match number of confidence predictions.";
@@ -171,13 +177,14 @@ void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
    top_shape.push_back(1);
    // Each row is a 7 dimension vector, which stores
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
-  top_shape.push_back(7);
+  top_shape.push_back(output_size_);
    top[0]->Reshape(top_shape);
  }
  
  template <typename Ftype, typename Btype>
  void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
      const vector<Blob*>& bottom, const vector<Blob*>& top) {
+
    const Ftype* loc_data = bottom[0]->cpu_data<Ftype>();
    const Ftype* conf_data = bottom[1]->cpu_data<Ftype>();
    const Ftype* prior_data = bottom[2]->cpu_data<Ftype>();
@@ -186,18 +193,16 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
    // Retrieve all location predictions.
    vector<LabelBBox> all_loc_preds;
    GetLocPredictions(loc_data, num, num_priors_, num_loc_classes_,
-                    share_location_, &all_loc_preds);
-
+                    share_location_, &all_loc_preds, num_keypoint_);
    // Retrieve all confidences.
    vector<map<int, vector<float> > > all_conf_scores;
    GetConfidenceScores(conf_data, num, num_priors_, num_classes_,
                        &all_conf_scores);
-
    // Retrieve all prior bboxes. It is same within a batch since we assume all
    // images in a batch are of same dimension.
    vector<NormalizedBBox> prior_bboxes;
    vector<vector<float> > prior_variances;
-  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
+  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances, code_size_);
  
    // Decode all loc predictions to bboxes.
    vector<LabelBBox> all_decode_bboxes;
@@ -275,7 +280,7 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
  
    vector<int> top_shape(2, 1);
    top_shape.push_back(num_kept);
-  top_shape.push_back(7);
+  top_shape.push_back(output_size_);
    Ftype* top_data;
    if (num_kept == 0) {
      LOG(INFO) << "Couldn't find any detections";
@@ -286,7 +291,7 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
      // Generate fake results per image.
      for (int i = 0; i < num; ++i) {
        top_data[0] = i;
-      top_data += 7;
+      top_data += output_size_;
      }
    } else {
      top[0]->Reshape(top_shape);
@@ -323,19 +328,25 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
        }
        for (int j = 0; j < indices.size(); ++j) {
          int idx = indices[j];
-        top_data[count * 7] = i;
-        top_data[count * 7 + 1] = label;
-        top_data[count * 7 + 2] = scores[idx];
+        top_data[count * output_size_] = i;
+        top_data[count * output_size_ + 1] = label;
+        top_data[count * output_size_ + 2] = scores[idx];
          const NormalizedBBox& bbox = bboxes[idx];
-        top_data[count * 7 + 3] = bbox.xmin();
-        top_data[count * 7 + 4] = bbox.ymin();
-        top_data[count * 7 + 5] = bbox.xmax();
-        top_data[count * 7 + 6] = bbox.ymax();
+        top_data[count * output_size_ + 3] = bbox.xmin();
+        top_data[count * output_size_ + 4] = bbox.ymin();
+        top_data[count * output_size_ + 5] = bbox.xmax();
+        top_data[count * output_size_ + 6] = bbox.ymax();
+
+        for ( int i = 7; i < output_size_; i+=2){
+          top_data[count * output_size_ + i] = bbox.keypoint_x((int)((i-7)/2));
+          top_data[count * output_size_ + i + 1] = bbox.keypoint_y((int)((i-7)/2));
+        }
+
          if (need_save_) {
            NormalizedBBox out_bbox;
            OutputBBox(bbox, sizes_[name_count_], has_resize_, resize_param_,
                       &out_bbox);
-          float score = top_data[count * 7 + 2];
+          float score = top_data[count * output_size_ + 2];
            float xmin = out_bbox.xmin();
            float ymin = out_bbox.ymin();
            float xmax = out_bbox.xmax();