1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #ifdef BLIS_TREE_BARRIER
38 void bli_free_barrier_tree( barrier_t* barrier )
39 {
40 if( barrier == NULL )
41 return;
42 barrier->count--;
43 if( barrier->count == 0 )
44 {
45 bli_free_barrier_tree( barrier->dad );
46 bli_free( barrier );
47 }
48 return;
49 }
50 barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index)
51 {
52 barrier_t* me = (barrier_t*) bli_malloc(sizeof(barrier_t));
54 me->dad = NULL;
55 me->signal = 0;
57 // Base Case
58 if( num_threads <= arity ) {
59 //Now must be registered as a leaf
60 for(int i = 0; i < num_threads; i++)
61 {
62 leaves[leaf_index + i] = me;
63 }
64 me->count = num_threads;
65 me->arity = num_threads;
66 }
67 else {
68 // Otherwise this node has children
69 int threads_per_kid = num_threads / arity;
70 int defecit = num_threads - threads_per_kid * arity;
72 for(int i = 0; i < arity; i++){
73 int threads_this_kid = threads_per_kid;
74 if(i < defecit) threads_this_kid++;
76 barrier_t* kid = bli_create_tree_barrier(threads_this_kid, arity, leaves, leaf_index);
77 kid->dad = me;
79 leaf_index += threads_this_kid;
80 }
81 me->count = arity;
82 me->arity = arity;
83 }
85 return me;
86 }
88 void bli_cleanup_communicator( thread_comm_t* communicator )
89 {
90 if( communicator == NULL ) return;
91 for( dim_t i = 0; i < communicator->n_threads; i++)
92 {
93 bli_free_barrier_tree( communicator->barriers[i] );
94 }
95 bli_free( communicator->barriers );
96 }
97 void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads)
98 {
99 if( communicator == NULL ) return;
100 communicator->sent_object = NULL;
101 communicator->n_threads = n_threads;
102 communicator->barriers = ( barrier_t** ) bli_malloc( sizeof( barrier_t* ) * n_threads );
103 bli_create_tree_barrier( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 );
104 }
106 void tree_barrier( barrier_t* barack )
107 {
108 #ifdef BLIS_ENABLE_OPENMP
109 int my_signal = barack->signal;
110 int my_count;
112 #ifdef BLIS_ENABLE_C66X_BUILD
113 _Pragma("omp atomic")
114 if(communicator == NULL || communicator->n_threads == 1)
115 return;
116 bool_t my_sense = communicator->barrier_sense;
117 dim_t my_threads_arrived;
119 _Pragma("omp atomic")
120 my_threads_arrived = ++(communicator->barrier_threads_arrived);
122 if( my_threads_arrived == communicator->n_threads ) {
123 communicator->barrier_threads_arrived = 0;
124 communicator->barrier_sense = !communicator->barrier_sense;
125 }
126 else {
127 volatile bool_t* listener = &communicator->barrier_sense;
128 while( *listener == my_sense ) {
129 CACHE_invL1d(communicator,sizeof(thread_comm_t),CACHE_WAIT);
130 CACHE_invL2(communicator,sizeof(thread_comm_t),CACHE_WAIT);
131 }
132 }
133 #else
134 _Pragma("omp atomic capture")
135 my_count = barack->count--;
137 if( my_count == 1 ) {
138 if( barack->dad != NULL ) {
139 tree_barrier( barack->dad );
140 }
141 barack->count = barack->arity;
142 barack->signal = !barack->signal;
143 }
144 else {
145 volatile int* listener = &barack->signal;
146 while( *listener == my_signal ) {
147 }
148 }
149 #endif
150 #else
151 return
152 #endif
153 }
155 void bli_barrier( thread_comm_t* comm, dim_t t_id )
156 {
157 tree_barrier( comm->barriers[t_id] );
158 }
160 //-----------end of #ifdef BLIS_TREE_BARRIER
161 #else
163 void bli_cleanup_communicator( thread_comm_t* communicator )
164 {
165 if( communicator == NULL ) return;
166 }
167 void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads)
168 {
169 if( communicator == NULL ) return;
170 communicator->sent_object = NULL;
171 communicator->n_threads = n_threads;
172 communicator->barrier_sense = 0;
173 communicator->barrier_threads_arrived = 0;
174 }
175 //barrier routine taken from art of multicore programming or something
176 void bli_barrier( thread_comm_t* communicator, dim_t t_id )
177 {
178 //printf("bli_barrier\n");
179 #ifdef BLIS_ENABLE_OPENMP
180 if(communicator == NULL || communicator->n_threads == 1)
181 return;
182 bool_t my_sense = communicator->barrier_sense;
183 dim_t my_threads_arrived;
185 #ifdef BLIS_ENABLE_C66X_BUILD
186 _Pragma("omp atomic")
187 my_threads_arrived = ++(communicator->barrier_threads_arrived);
188 if( my_threads_arrived == communicator->n_threads ) {
189 communicator->barrier_threads_arrived = 0;
190 communicator->barrier_sense = !communicator->barrier_sense;
191 }
192 else {
193 volatile bool_t* listener = &communicator->barrier_sense;
194 while( *listener == my_sense )
195 {
196 CACHE_invL1d(communicator,sizeof(thread_comm_t),CACHE_WAIT);
197 CACHE_invL2(communicator,sizeof(thread_comm_t),CACHE_WAIT);
198 }
199 }
200 #else //---BLIS_ENABLE_C66X_BUILD
201 _Pragma("omp atomic capture")
202 my_threads_arrived = ++(communicator->barrier_threads_arrived);
203 if( my_threads_arrived == communicator->n_threads ) {
204 communicator->barrier_threads_arrived = 0;
205 communicator->barrier_sense = !communicator->barrier_sense;
206 }
207 else {
208 volatile bool_t* listener = &communicator->barrier_sense;
209 while( *listener == my_sense ) { }
210 }
211 #endif //---BLIS_ENABLE_C66X_BUILD
213 #else
214 return;
215 #endif
216 }
217 #endif
219 void bli_free_communicator( thread_comm_t* communicator )
220 {
221 if( communicator == NULL ) return;
222 bli_cleanup_communicator( communicator );
223 bli_free( communicator );
224 }
226 thread_comm_t* bli_create_communicator( dim_t n_threads )
227 {
228 #ifdef BLIS_ENABLE_C66X_BUILD
229 thread_comm_t* comm;
230 /* Making sure the communicator address and size is aligned to the cache line. This
231 * makes sure that only the communicator is invalidated during bli_barrier. Added
232 * bli_memalign API for C66 that calls memalign instead of malloc.
233 */
234 siz_t communicator_size;
235 communicator_size = BLIS_CACHE_LINE_SIZE*(sizeof(thread_comm_t)/BLIS_CACHE_LINE_SIZE +1);
236 comm = (thread_comm_t*) bli_memalign(BLIS_CACHE_LINE_SIZE, communicator_size );
237 #else
238 thread_comm_t* comm = (thread_comm_t*) bli_malloc( sizeof(thread_comm_t) );
239 #endif
240 bli_setup_communicator( comm, n_threads );
241 return comm;
242 }
244 void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_send )
245 {
246 if( communicator == NULL || communicator->n_threads == 1 ) return to_send;
248 if( id == 0 ) communicator->sent_object = to_send;
250 bli_barrier( communicator, id );
251 void * object = communicator->sent_object;
252 bli_barrier( communicator, id );
254 return object;
255 }
257 thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
258 dim_t n_way, dim_t work_id )
259 {
261 thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) );
262 bli_setup_thread_info( thr, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id );
263 return thr;
264 }
266 void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
267 dim_t n_way, dim_t work_id )
268 {
269 thr->ocomm = ocomm;
270 thr->ocomm_id = ocomm_id;
271 thr->icomm = icomm;
272 thr->icomm_id = icomm_id;
274 thr->n_way = n_way;
275 thr->work_id = work_id;
276 }
278 void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
279 {
280 thrinfo_t* thread = (thrinfo_t*) thr;
281 dim_t n_way = thread->n_way;
282 dim_t work_id = thread->work_id;
284 dim_t size = all_end - all_start;
285 dim_t n_pt = size / n_way;
286 n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
287 n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor);
288 *start = work_id * n_pt + all_start;
289 *end = bli_min( *start + n_pt, size + all_start );
290 }
292 void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
293 {
294 thrinfo_t* thread = (thrinfo_t*) thr;
295 dim_t n_way = thread->n_way;
296 dim_t work_id = thread->work_id;
297 dim_t size = all_end - all_start;
298 double num;
300 *start = 0;
301 *end = all_end - all_start;
302 num = size*size / (double) n_way; // 2xArea per thread?
304 if( forward ) {
305 dim_t curr_caucus = n_way - 1;
306 dim_t len = 0;
307 while(1){
308 dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
309 width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
310 if( curr_caucus == work_id ) {
311 *start = bli_max( 0 , *end - width ) + all_start;
312 *end = *end + all_start;
313 return;
314 }
315 else{
316 *end -= width;
317 len += width;
318 curr_caucus--;
319 }
320 }
321 }
322 else{
323 while(1){
324 dim_t width = ceil(sqrt(*start * *start + num)) - *start;
325 width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
327 if( work_id == 0 ) {
328 *start = *start + all_start;
329 *end = bli_min( *start + width, all_end );
330 return;
331 }
332 else{
333 *start = *start + width;
334 }
335 work_id--;
336 }
337 }
338 }
340 void bli_level3_thread_decorator( dim_t n_threads,
341 level3_int_t func,
342 obj_t* alpha,
343 obj_t* a,
344 obj_t* b,
345 obj_t* beta,
346 obj_t* c,
347 void* cntl,
348 void** thread )
349 {
350 #ifdef BLIS_ENABLE_OPENMP
351 _Pragma( "omp parallel num_threads(n_threads)" )
352 {
353 dim_t omp_id = omp_get_thread_num();
355 func( alpha,
356 a,
357 b,
358 beta,
359 c,
360 cntl,
361 thread[omp_id] );
362 }
363 #else
364 func( alpha,
365 a,
366 b,
367 beta,
368 c,
369 cntl,
370 thread[0] );
371 #endif
372 }
374 dim_t bli_read_nway_from_env( char* env )
375 {
376 dim_t number = 1;
377 #ifdef BLIS_ENABLE_C66X_BUILD
378 if(strcmp(env,"BLIS_JC_NT")==0)
379 number = 1;
380 if(strcmp(env,"BLIS_IC_NT")==0)
381 number = 8;
382 if(strcmp(env,"BLIS_JR_NT")==0)
383 number = 1;
384 if(strcmp(env,"BLIS_IR_NT")==0)
385 number = 1;
386 return number;
387 #else
388 char* str = getenv( env );
389 if( str != NULL )
390 {
391 number = strtol( str, NULL, 10 );
392 }
393 return number;
394 #endif
395 }
397 dim_t bli_gcd( dim_t x, dim_t y )
398 {
399 while( y != 0 ) {
400 dim_t t = y;
401 y = x % y;
402 x = t;
403 }
404 return x;
405 }
407 dim_t bli_lcm( dim_t x, dim_t y)
408 {
409 return x * y / bli_gcd( x, y );
410 }