diff --git a/src/caffeine/layers/inner_product_layer.cu b/src/caffeine/layers/inner_product_layer.cu
index 8130dcc01ba785657572ecf7d2969814d0d0c7d3..5afe1469cbdb6dfca260275b65039c5ebefb9814 100644 (file)
}
template <typename Dtype>
-__global__ void BroadcastCopy(const int total, const int vec_len,
- const Dtype* in_vec, Dtype* out_matrix) {
+__global__ void BroadcastRow(const int total, const int vec_len,
+ const Dtype* in_vec, Dtype* out_matrix) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < total) {
int v_index = index % vec_len;
}
}
+
+
template <typename Dtype>
void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
@@ -113,12 +115,12 @@ void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const Dtype* bias = NULL;
Dtype alpha = 1., beta = 0.;
if (biasterm_) {
- bias = this->blobs_[1].gpu_data();
- beta = 1.;
- const int count = (*top)[0]->count();
- // we pre-copy the bias to the results, and then call gemm.
- BroadcastCopy<<<CAFFEINE_GET_BLOCKS(count), CAFFEINE_CUDA_NUM_THREADS>>>(
- count, N_, bias, top_data);
+ bias = this->blobs_[1].gpu_data();
+ beta = 1.;
+ const int count = (*top)[0]->count();
+ // we pre-copy the bias to the results, and then call gemm.
+ BroadcastRow<<<CAFFEINE_GET_BLOCKS(count), CAFFEINE_CUDA_NUM_THREADS>>>(
+ count, N_, bias, top_data);
}
switch(sizeof(Dtype)) {
case sizeof(float):