Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - jacinto-ai/caffe-jacinto.git/blobdiff - src/caffe/layers/detection_output_layer.cpp
Keypoint inference support for CPU only flow
[jacinto-ai/caffe-jacinto.git] / src / caffe / layers / detection_output_layer.cpp
index 0e9b619346a17b4ee0c3bdabac1cccf1843790b3..8d7ab2f382f952bdffb9fd205ccaa10a22ad691e 100644 (file)
@@ -121,11 +121,16 @@ void DetectionOutputLayer<Ftype, Btype>::LayerSetUp(const vector<Blob*>& bottom,
     bbox_permute_.ReshapeLike(*(bottom[0]));
   }
   conf_permute_.ReshapeLike(*(bottom[1]));
+  num_keypoint_ = detection_output_param.num_keypoint();
+  output_size_ = num_keypoint_*2 + 7;
+  code_size_ = num_keypoint_*2 + 4;
 }
 
 template <typename Ftype, typename Btype>
 void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
       const vector<Blob*>& top) {
+  const int code_size_ = num_keypoint_*2 + 4;
+
   if (need_save_) {
     CHECK_LE(name_count_, names_.size());
     if (name_count_ % num_test_image_ == 0) {
@@ -159,8 +164,9 @@ void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
       conf_permute_.count(1) != bottom[1]->count(1)) {
     conf_permute_.ReshapeLike(*(bottom[1]));
   }
-  num_priors_ = bottom[2]->height() / 4;
-  CHECK_EQ(num_priors_ * num_loc_classes_ * 4, bottom[0]->channels())
+  num_priors_ = bottom[2]->height() / code_size_;
+
+  CHECK_EQ(num_priors_ * num_loc_classes_ * code_size_, bottom[0]->channels())
       << "Number of priors must match number of location predictions.";
   CHECK_EQ(num_priors_ * num_classes_, bottom[1]->channels())
       << "Number of priors must match number of confidence predictions.";
@@ -171,13 +177,14 @@ void DetectionOutputLayer<Ftype, Btype>::Reshape(const vector<Blob*>& bottom,
   top_shape.push_back(1);
   // Each row is a 7 dimension vector, which stores
   // [image_id, label, confidence, xmin, ymin, xmax, ymax]
-  top_shape.push_back(7);
+  top_shape.push_back(output_size_);
   top[0]->Reshape(top_shape);
 }
 
 template <typename Ftype, typename Btype>
 void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
     const vector<Blob*>& bottom, const vector<Blob*>& top) {
+
   const Ftype* loc_data = bottom[0]->cpu_data<Ftype>();
   const Ftype* conf_data = bottom[1]->cpu_data<Ftype>();
   const Ftype* prior_data = bottom[2]->cpu_data<Ftype>();
@@ -186,18 +193,16 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
   // Retrieve all location predictions.
   vector<LabelBBox> all_loc_preds;
   GetLocPredictions(loc_data, num, num_priors_, num_loc_classes_,
-                    share_location_, &all_loc_preds);
-
+                    share_location_, &all_loc_preds, num_keypoint_);
   // Retrieve all confidences.
   vector<map<int, vector<float> > > all_conf_scores;
   GetConfidenceScores(conf_data, num, num_priors_, num_classes_,
                       &all_conf_scores);
-
   // Retrieve all prior bboxes. It is same within a batch since we assume all
   // images in a batch are of same dimension.
   vector<NormalizedBBox> prior_bboxes;
   vector<vector<float> > prior_variances;
-  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
+  GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances, code_size_);
 
   // Decode all loc predictions to bboxes.
   vector<LabelBBox> all_decode_bboxes;
@@ -275,7 +280,7 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
 
   vector<int> top_shape(2, 1);
   top_shape.push_back(num_kept);
-  top_shape.push_back(7);
+  top_shape.push_back(output_size_);
   Ftype* top_data;
   if (num_kept == 0) {
     LOG(INFO) << "Couldn't find any detections";
@@ -286,7 +291,7 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
     // Generate fake results per image.
     for (int i = 0; i < num; ++i) {
       top_data[0] = i;
-      top_data += 7;
+      top_data += output_size_;
     }
   } else {
     top[0]->Reshape(top_shape);
@@ -323,19 +328,25 @@ void DetectionOutputLayer<Ftype, Btype>::Forward_cpu(
       }
       for (int j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
-        top_data[count * 7] = i;
-        top_data[count * 7 + 1] = label;
-        top_data[count * 7 + 2] = scores[idx];
+        top_data[count * output_size_] = i;
+        top_data[count * output_size_ + 1] = label;
+        top_data[count * output_size_ + 2] = scores[idx];
         const NormalizedBBox& bbox = bboxes[idx];
-        top_data[count * 7 + 3] = bbox.xmin();
-        top_data[count * 7 + 4] = bbox.ymin();
-        top_data[count * 7 + 5] = bbox.xmax();
-        top_data[count * 7 + 6] = bbox.ymax();
+        top_data[count * output_size_ + 3] = bbox.xmin();
+        top_data[count * output_size_ + 4] = bbox.ymin();
+        top_data[count * output_size_ + 5] = bbox.xmax();
+        top_data[count * output_size_ + 6] = bbox.ymax();
+
+        for ( int i = 7; i < output_size_; i+=2){
+          top_data[count * output_size_ + i] = bbox.keypoint_x((int)((i-7)/2));
+          top_data[count * output_size_ + i + 1] = bbox.keypoint_y((int)((i-7)/2));
+        }
+
         if (need_save_) {
           NormalizedBBox out_bbox;
           OutputBBox(bbox, sizes_[name_count_], has_resize_, resize_param_,
                      &out_bbox);
-          float score = top_data[count * 7 + 2];
+          float score = top_data[count * output_size_ + 2];
           float xmin = out_bbox.xmin();
           float ymin = out_bbox.ymin();
           float xmax = out_bbox.xmax();