1 // cudamatrix/cu-sp-matrix.cc
3 // Copyright 2013 Karel Vesely
4 // 2014-2015 Johns Hopkins University (author: Daniel Povey)
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
21 #if HAVE_CUDA == 1
22 #include <cuda_runtime_api.h>
23 #include <cublas_v2.h>
24 #endif
26 #include "base/timer.h"
27 #include "cudamatrix/cu-common.h"
28 #include "cudamatrix/cu-vector.h"
29 #include "cudamatrix/cu-device.h"
30 #include "cudamatrix/cu-kernels.h"
31 #include "cudamatrix/cu-math.h"
32 #include "cudamatrix/cu-sp-matrix.h"
33 #include "cudamatrix/cu-matrix.h"
34 #include "cudamatrix/cublas-wrappers.h"
36 namespace kaldi {
38 template<typename Real>
39 void CuSpMatrix<Real>::CopyFromMat(const CuMatrixBase<Real> &M,
40 SpCopyType copy_type) {
41 KALDI_ASSERT(this->num_rows_ == M.NumRows() &&
42 this->num_rows_ == M.NumCols());
43 if (this->num_rows_ == 0)
44 return;
45 #if HAVE_CUDA == 1
46 if (CuDevice::Instantiate().Enabled()) {
47 CuTimer tim;
48 MatrixIndexT D = this->NumRows();
49 if (D == 0)
50 return;
51 switch (copy_type) {
52 case kTakeMeanAndCheck:
53 KALDI_ERR << "kTakeMeanAndCheck not supported!";
54 // The grid/block dimensions have been very roughly tuned for the
55 // individual cases.
56 case kTakeMean:
57 {
58 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
59 dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
60 cuda_take_mean(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
61 CU_SAFE_CALL(cudaGetLastError());
62 }
63 break;
64 case kTakeLower:
65 {
66 int32 block_size = std::min(CU1DBLOCK, this->num_rows_);
67 dim3 dimBlock(1, block_size);
68 dim3 dimGrid(D, n_blocks(D, block_size));
69 cuda_take_lower(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
70 CU_SAFE_CALL(cudaGetLastError());
71 }
72 break;
73 case kTakeUpper:
74 {
75 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
76 dim3 dimGrid(n_blocks(D, CU2DBLOCK), n_blocks(D, CU2DBLOCK));
77 cuda_take_upper(dimGrid, dimBlock, M.Data(), this->data_, M.Dim());
78 CU_SAFE_CALL(cudaGetLastError());
79 }
80 break;
81 default:
82 KALDI_ASSERT("Invalid argument to CuSpMatrix::CopyFromMat");
83 }
84 CuDevice::Instantiate().AccuProfile("CuSpMatrix::CopyFromMat(from CuMatrixBase)", tim);
85 } else
86 #endif
87 {
88 Mat().CopyFromMat(M.Mat(), copy_type);
89 }
90 }
92 template<typename Real>
93 void CuSpMatrix<Real>::Invert() {
94 #if HAVE_CUDA == 1
95 if (CuDevice::Instantiate().Enabled()) {
96 CuMatrix<Real> mat(this->num_rows_, this->num_rows_);
97 mat.CopyFromSp(*this);
98 mat.SymInvertPosDef();
99 this->CopyFromMat(mat);
100 } else
101 #endif
102 { // Use inversion of CPU-based SpMatrix.
103 Mat().Invert();
104 }
105 }
107 template<typename Real>
108 void CuSpMatrix<Real>::AddVec2(const Real alpha, const CuVectorBase<Real> &v) {
109 KALDI_ASSERT(v.Dim() == this->NumRows());
110 #if HAVE_CUDA == 1
111 if (CuDevice::Instantiate().Enabled()) {
112 if (this->num_rows_ == 0) return;
113 CuTimer tim;
114 size_t nr = this->num_rows_;
115 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
116 dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK));
118 CUBLAS_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(),
119 1, this->Data()));
121 CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim);
122 } else
123 #endif
124 {
125 Mat().AddVec2(alpha, v.Vec());
126 }
127 }
129 template<typename Real>
130 void CuSpMatrix<Real>::AddMat2(const Real alpha, const CuMatrixBase<Real> &M,
131 MatrixTransposeType transM, const Real beta) {
132 KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
133 || (transM == kTrans && this->NumRows() == M.NumCols()));
135 #if HAVE_CUDA == 1
136 if (CuDevice::Instantiate().Enabled()) {
137 if (this->num_rows_ == 0) return;
138 CuTimer tim;
139 MatrixIndexT this_dim = this->NumRows(),
140 m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows());
142 if (this_dim == 0) return;
143 if (alpha == 0.0) {
144 if (beta != 1.0) this->Scale(beta);
145 return;
146 }
148 cublasOperation_t trans = (transM == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
150 CuMatrix<Real> tmp_mat(*this);
151 cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(),
152 M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride());
153 this->CopyFromMat(tmp_mat, kTakeLower);
155 CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddMat2", tim);
156 } else
157 #endif
158 {
159 Mat().AddMat2(alpha, M.Mat(), transM, beta);
160 }
161 }
163 /**
164 * C++ templatd wrapper of ANSI-C CUBLAS function GEMM (matrix multiply)
165 */
167 template<typename Real, typename OtherReal>
168 Real TraceSpSp(const CuSpMatrix<Real> &A, const CuSpMatrix<OtherReal> &B) {
169 KALDI_ASSERT(A.NumRows() == B.NumRows());
170 #if HAVE_CUDA == 1
171 if (CuDevice::Instantiate().Enabled()) {
172 if (A.NumRows() == 0) return 0.0;
173 MatrixIndexT nr = A.NumRows(), size = nr * (nr+1) / 2;
174 CuVector<Real> Adiag(nr, kUndefined);
175 CuVector<OtherReal> Bdiag(nr, kUndefined);
176 Adiag.CopyDiagFromPacked(A);
177 Bdiag.CopyDiagFromPacked(B);
178 CuSubVector<Real> Aall(A.Data(), size);
179 CuSubVector<OtherReal> Ball(B.Data(), size);
180 // Below, we subtrace VecVec(Adiag, Bdiag) to remove double-counting
181 // on the diagonal.
182 return 2.0 * VecVec(Aall, Ball) - VecVec(Adiag, Bdiag);
183 } else
184 #endif
185 {
186 return TraceSpSp(A.Mat(), B.Mat());
187 }
188 }
189 template
190 float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<float> &B);
191 template
192 float TraceSpSp(const CuSpMatrix<float> &A, const CuSpMatrix<double> &B);
193 template
194 double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<float> &B);
195 template
196 double TraceSpSp(const CuSpMatrix<double> &A, const CuSpMatrix<double> &B);
199 template<typename Real>
200 bool CuSpMatrix<Real>::ApproxEqual(const CuSpMatrix<Real> &B, Real tol) const {
201 KALDI_ASSERT(this->NumRows() == B.NumRows());
202 CuSpMatrix<Real> diff(*this);
203 diff.AddSp(-1.0, B);
204 Real a = this->FrobeniusNorm(), b = B.FrobeniusNorm(),
205 d = diff.FrobeniusNorm();
206 return (d <= tol * std::max(a, b));
207 }
209 template<typename Real>
210 bool CuSpMatrix<Real>::IsUnit(Real tol) const {
211 // want to return:
212 //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.:
213 //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows()
214 // trace((*this - I)(*this - I)) <= tol * NumRows()
215 // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows()
216 // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows()
218 // Note: we could do this more efficiently still, by slightly changing the
219 // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp
220 // that corrects for the diagonal being counted twice.
221 return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <=
222 tol * this->NumRows());
223 }
225 template <class Real>
226 CuSpMatrix<Real>& CuSpMatrix<Real>::operator = (const CuSpMatrix<Real> &in) {
227 this->Resize(in.NumRows(), kUndefined);
228 this->CopyFromPacked(in);
229 return *this;
230 }
232 template class CuSpMatrix<float>;
233 template class CuSpMatrix<double>;
237 } // namespace