1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #define FUNCPTR_T gemm_fp
39 typedef void (*FUNCPTR_T)( obj_t* a,
40 obj_t* b,
41 obj_t* c,
42 gemm_t* cntl,
43 gemm_thrinfo_t* thread );
45 static FUNCPTR_T vars[6][3] =
46 {
47 // unblocked optimized unblocked blocked
48 { NULL, NULL, bli_gemm_blk_var1f },
49 { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2f },
50 { NULL, NULL, bli_gemm_blk_var3f },
51 { NULL, NULL, NULL },
52 { NULL, NULL, NULL },
53 { NULL, NULL, NULL }
54 };
56 void bli_gemm_int( obj_t* alpha,
57 obj_t* a,
58 obj_t* b,
59 obj_t* beta,
60 obj_t* c,
61 gemm_t* cntl,
62 gemm_thrinfo_t* thread )
63 {
64 obj_t a_local;
65 obj_t b_local;
66 obj_t c_local;
67 varnum_t n;
68 impl_t i;
69 FUNCPTR_T f;
70 #if defined(BLIS_ENABLE_PROFILE)
71 volatile uint64_t counter_start;
72 volatile uint64_t counter_end;
73 extern profile_data_t *bli_gemm_profile_data;
74 dim_t m_var, k_var, n_var;
75 dim_t index;
76 #endif
78 // Extract the variant number and implementation type.
79 n = cntl_var_num( cntl );
80 i = cntl_impl_type( cntl );
82 // Check parameters.
83 if ( bli_error_checking_is_enabled() )
84 bli_gemm_int_check( alpha, a, b, beta, c, cntl ); // creating the errors. print sizes of a,b,c
86 // If C has a zero dimension, return early.
87 if ( bli_obj_has_zero_dim( *c ) ) return;
89 // If A or B has a zero dimension, scale C by beta and return early.
90 if ( bli_obj_has_zero_dim( *a ) ||
91 bli_obj_has_zero_dim( *b ) )
92 {
93 if( thread_am_ochief( thread ) )
94 bli_scalm( beta, c );
95 thread_obarrier( thread );
96 return;
97 }
99 // If A or B is marked as being filled with zeros, scale C by beta and
100 // return early.
101 if ( bli_obj_is_zeros( *a ) ||
102 bli_obj_is_zeros( *b ) )
103 {
104 if( thread_am_ochief( thread ) )
105 bli_scalm( beta, c );
106 thread_obarrier( thread );
107 return;
108 }
110 // Alias A and B in case we need to update attached scalars.
111 bli_obj_alias_to( *a, a_local );
112 bli_obj_alias_to( *b, b_local );
114 // Alias C in case we need to induce a transposition.
115 bli_obj_alias_to( *c, c_local );
117 // If we are about to call a leaf-level implementation, and matrix C
118 // still needs a transposition, then we must induce one by swapping the
119 // strides and dimensions. Note that this transposition would normally
120 // be handled explicitly in the packing of C, but if C is not being
121 // packed, this is our last chance to handle the transposition.
122 if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
123 {
124 //if( thread_am_ochief( thread ) ) {
125 bli_obj_induce_trans( c_local );
126 bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
127 // }
128 }
130 // If alpha is non-unit, typecast and apply it to the scalar attached
131 // to B.
132 if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
133 {
134 bli_obj_scalar_apply_scalar( alpha, &b_local );
135 }
138 // If beta is non-unit, typecast and apply it to the scalar attached
139 // to C.
140 if ( !bli_obj_equals( beta, &BLIS_ONE ) )
141 {
142 bli_obj_scalar_apply_scalar( beta, &c_local );
143 }
145 // Extract the variant number and implementation type.
146 n = cntl_var_num( cntl );
147 i = cntl_impl_type( cntl );
149 // Index into the variant array to extract the correct function pointer.
150 f = vars[n][i]; // print out n and i
152 // Invoke the variant.
153 #if defined(BLIS_ENABLE_PROFILE)
154 m_var = bli_obj_length( c_local );
155 k_var = bli_obj_width_after_trans( a_local );
156 n_var = bli_obj_width( c_local );
158 #if defined(BLIS_ENABLE_C66X_BUILD)
159 TSCL = 0;
160 counter_start = lib_clock64();
161 #else
162 counter_start = (uint64_t) (bli_clock()*1.2e9);
163 #endif
164 #endif
165 f( &a_local,
166 &b_local,
167 &c_local,
168 cntl,
169 thread );
171 #if defined(BLIS_ENABLE_PROFILE)
172 #if defined(BLIS_ENABLE_C66X_BUILD)
173 counter_end = lib_clock64();
174 #else
175 counter_end = (uint64_t) (bli_clock()*1.2e9);
176 #endif
177 bli_profile_get_index(n, i, index);
178 bli_profile_data_update(bli_gemm_profile_data[index], (counter_end-counter_start), m_var*k_var*n_var);
179 #endif
180 }