]> Gitweb @ Texas Instruments - Open Source Git Repositories - git.TI.com/gitweb - dense-linear-algebra-libraries/linalg.git/blob - blis/frame/base/bli_threading.c
Consolidate all git repos of linalg into one.
[dense-linear-algebra-libraries/linalg.git] / blis / frame / base / bli_threading.c
1 /*
3    BLIS    
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
7    Copyright (C) 2014, The University of Texas at Austin
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas at Austin nor the names
18       of its contributors may be used to endorse or promote products
19       derived from this software without specific prior written permission.
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #ifdef BLIS_TREE_BARRIER
38 void bli_free_barrier_tree( barrier_t* barrier )
39 {
40     if( barrier == NULL )
41         return;
42     barrier->count--;
43     if( barrier->count == 0 )
44     {
45         bli_free_barrier_tree( barrier->dad );
46         bli_free( barrier );
47     }
48     return;
49 }
50 barrier_t* bli_create_tree_barrier(int num_threads, int arity, barrier_t** leaves, int leaf_index)
51 {
52     barrier_t* me = (barrier_t*) bli_malloc(sizeof(barrier_t));
54     me->dad = NULL;
55     me->signal = 0;
57     // Base Case
58     if( num_threads <= arity ) {
59        //Now must be registered as a leaf
60        for(int i = 0; i < num_threads; i++)
61        {
62            leaves[leaf_index + i] = me;
63        }
64        me->count = num_threads;
65        me->arity = num_threads;
66     }
67     else {
68         // Otherwise this node has children
69         int threads_per_kid = num_threads / arity;
70         int defecit = num_threads - threads_per_kid * arity;
72         for(int i = 0; i < arity; i++){
73             int threads_this_kid = threads_per_kid;
74             if(i < defecit) threads_this_kid++;
75             
76             barrier_t* kid = bli_create_tree_barrier(threads_this_kid, arity, leaves, leaf_index);
77             kid->dad = me;
79             leaf_index += threads_this_kid;
80         }  
81         me->count = arity;
82         me->arity = arity;
83     }  
85     return me;
86 }
88 void bli_cleanup_communicator( thread_comm_t* communicator )
89 {
90     if( communicator == NULL ) return;
91     for( dim_t i = 0; i < communicator->n_threads; i++)
92     {
93        bli_free_barrier_tree( communicator->barriers[i] );
94     }
95     bli_free( communicator->barriers );
96 }
97 void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads)
98 {
99     if( communicator == NULL ) return;
100     communicator->sent_object = NULL;
101     communicator->n_threads = n_threads;
102     communicator->barriers = ( barrier_t** ) bli_malloc( sizeof( barrier_t* ) * n_threads );
103     bli_create_tree_barrier( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 );
106 void tree_barrier( barrier_t* barack )
108 #ifdef BLIS_ENABLE_OPENMP
109     int my_signal = barack->signal;
110     int my_count;
112 #ifdef BLIS_ENABLE_C66X_BUILD
113     _Pragma("omp atomic")
114     if(communicator == NULL || communicator->n_threads == 1)
115         return;
116     bool_t my_sense = communicator->barrier_sense;
117     dim_t my_threads_arrived;
119         _Pragma("omp atomic")
120         my_threads_arrived = ++(communicator->barrier_threads_arrived);
122     if( my_threads_arrived == communicator->n_threads ) {
123         communicator->barrier_threads_arrived = 0;
124         communicator->barrier_sense = !communicator->barrier_sense;
125     }
126     else {
127         volatile bool_t* listener = &communicator->barrier_sense;
128         while( *listener == my_sense ) {
129                 CACHE_invL1d(communicator,sizeof(thread_comm_t),CACHE_WAIT);
130                 CACHE_invL2(communicator,sizeof(thread_comm_t),CACHE_WAIT);
131         }
132     }
133 #else
134     _Pragma("omp atomic capture")
135     my_count = barack->count--;
137     if( my_count == 1 ) {
138         if(  barack->dad != NULL ) {
139                 tree_barrier( barack->dad );
140         }
141         barack->count = barack->arity;
142         barack->signal = !barack->signal;
143     }
144     else {
145         volatile int* listener = &barack->signal;
146         while( *listener == my_signal ) {
147         }
148     }
149 #endif
150 #else
151     return
152 #endif
155 void bli_barrier( thread_comm_t* comm, dim_t t_id )
157     tree_barrier( comm->barriers[t_id] );
160 //-----------end of #ifdef BLIS_TREE_BARRIER
161 #else
163 void bli_cleanup_communicator( thread_comm_t* communicator )
165     if( communicator == NULL ) return;
167 void bli_setup_communicator( thread_comm_t* communicator, dim_t n_threads)
169     if( communicator == NULL ) return;
170     communicator->sent_object = NULL;
171     communicator->n_threads = n_threads;
172     communicator->barrier_sense = 0;
173     communicator->barrier_threads_arrived = 0;
175 //barrier routine taken from art of multicore programming or something
176 void bli_barrier( thread_comm_t* communicator, dim_t t_id )
178         //printf("bli_barrier\n");
179 #ifdef BLIS_ENABLE_OPENMP
180     if(communicator == NULL || communicator->n_threads == 1)
181         return;
182     bool_t my_sense = communicator->barrier_sense;
183     dim_t my_threads_arrived;
185 #ifdef BLIS_ENABLE_C66X_BUILD
186     _Pragma("omp atomic")
187         my_threads_arrived = ++(communicator->barrier_threads_arrived);
188     if( my_threads_arrived == communicator->n_threads ) {
189         communicator->barrier_threads_arrived = 0;
190         communicator->barrier_sense = !communicator->barrier_sense;
191     }
192     else {
193         volatile bool_t* listener = &communicator->barrier_sense;
194         while( *listener == my_sense )
195         {
196                 CACHE_invL1d(communicator,sizeof(thread_comm_t),CACHE_WAIT);
197                 CACHE_invL2(communicator,sizeof(thread_comm_t),CACHE_WAIT);
198         }
199     }
200 #else //---BLIS_ENABLE_C66X_BUILD
201     _Pragma("omp atomic capture")
202     my_threads_arrived = ++(communicator->barrier_threads_arrived);
203     if( my_threads_arrived == communicator->n_threads ) {
204         communicator->barrier_threads_arrived = 0;
205         communicator->barrier_sense = !communicator->barrier_sense;
206     }
207     else {
208         volatile bool_t* listener = &communicator->barrier_sense;
209         while( *listener == my_sense ) { }
210     }
211 #endif //---BLIS_ENABLE_C66X_BUILD
213 #else
214     return;
215 #endif
217 #endif
219 void bli_free_communicator( thread_comm_t* communicator )
221     if( communicator == NULL ) return;
222     bli_cleanup_communicator( communicator );
223     bli_free( communicator );
226 thread_comm_t* bli_create_communicator( dim_t n_threads )
228 #ifdef BLIS_ENABLE_C66X_BUILD
229         thread_comm_t* comm;
230     /* Making sure the communicator address and size is aligned to the cache line. This
231      * makes sure that only the communicator is invalidated during bli_barrier. Added
232      * bli_memalign API for C66 that calls memalign instead of malloc.
233      */
234     siz_t communicator_size;
235     communicator_size = BLIS_CACHE_LINE_SIZE*(sizeof(thread_comm_t)/BLIS_CACHE_LINE_SIZE +1);
236     comm = (thread_comm_t*) bli_memalign(BLIS_CACHE_LINE_SIZE, communicator_size );
237 #else
238     thread_comm_t* comm = (thread_comm_t*) bli_malloc( sizeof(thread_comm_t) );
239 #endif
240     bli_setup_communicator( comm, n_threads );
241     return comm;
244 void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_send )
245 {   
246     if( communicator == NULL || communicator->n_threads == 1 ) return to_send;
248     if( id == 0 ) communicator->sent_object = to_send;
250     bli_barrier( communicator, id );
251     void * object = communicator->sent_object;
252     bli_barrier( communicator, id );
254     return object;
257 thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
258                              dim_t n_way, dim_t work_id )
261         thrinfo_t* thr = (thrinfo_t*) bli_malloc( sizeof(thrinfo_t) );
262         bli_setup_thread_info( thr, ocomm, ocomm_id, icomm, icomm_id, n_way, work_id );
263         return thr;
266 void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
267                              dim_t n_way, dim_t work_id )
269         thr->ocomm = ocomm;
270         thr->ocomm_id = ocomm_id;
271         thr->icomm = icomm;
272         thr->icomm_id = icomm_id;
274         thr->n_way = n_way;
275         thr->work_id = work_id;
278 void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
280     thrinfo_t* thread = (thrinfo_t*) thr;
281     dim_t n_way = thread->n_way;
282     dim_t work_id = thread->work_id;
284     dim_t size = all_end - all_start;
285     dim_t n_pt = size / n_way;
286     n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
287     n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); 
288     *start = work_id * n_pt + all_start;
289     *end   = bli_min( *start + n_pt, size + all_start );
292 void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
294     thrinfo_t* thread = (thrinfo_t*) thr;
295     dim_t n_way = thread->n_way;
296     dim_t work_id = thread->work_id;
297     dim_t size = all_end - all_start;
298     double num;
300     *start = 0;
301     *end   = all_end - all_start;
302     num = size*size / (double) n_way; // 2xArea per thread?
304     if( forward ) {
305         dim_t curr_caucus = n_way - 1;
306         dim_t len = 0;
307         while(1){
308             dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
309             width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
310             if( curr_caucus == work_id ) {
311                 *start = bli_max( 0 , *end - width ) + all_start;
312                 *end = *end + all_start;
313                 return;
314             }
315             else{
316                 *end -= width;
317                 len += width;
318                 curr_caucus--;
319             }
320         }
321     }
322     else{
323         while(1){
324             dim_t width = ceil(sqrt(*start * *start + num)) - *start;
325             width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
327             if( work_id == 0 ) {
328                 *start = *start + all_start;
329                 *end = bli_min( *start + width, all_end );
330                 return;
331             }
332             else{
333                 *start = *start + width;
334             }
335             work_id--;
336         }
337     }
340 void bli_level3_thread_decorator( dim_t n_threads, 
341                                   level3_int_t func, 
342                                   obj_t* alpha, 
343                                   obj_t* a, 
344                                   obj_t* b, 
345                                   obj_t* beta, 
346                                   obj_t* c, 
347                                   void* cntl, 
348                                   void** thread )
350 #ifdef BLIS_ENABLE_OPENMP
351     _Pragma( "omp parallel num_threads(n_threads)" )
352     {
353         dim_t omp_id = omp_get_thread_num();
355         func( alpha,
356                   a,
357                   b,
358                   beta,
359                   c,
360                   cntl,
361                   thread[omp_id] );
362     }
363 #else
364         func( alpha,
365                   a,
366                   b,
367                   beta,
368                   c,
369                   cntl,
370                   thread[0] );
371 #endif
374 dim_t bli_read_nway_from_env( char* env )
376     dim_t number = 1;
377 #ifdef BLIS_ENABLE_C66X_BUILD
378         if(strcmp(env,"BLIS_JC_NT")==0)
379                 number = 1;
380         if(strcmp(env,"BLIS_IC_NT")==0)
381                 number = 8;
382         if(strcmp(env,"BLIS_JR_NT")==0)
383                 number = 1;
384         if(strcmp(env,"BLIS_IR_NT")==0)
385                 number = 1;
386         return number;
387 #else
388     char* str = getenv( env );
389     if( str != NULL )
390     {   
391         number = strtol( str, NULL, 10 );
392     }   
393     return number;
394 #endif
397 dim_t bli_gcd( dim_t x, dim_t y )
399     while( y != 0 ) {
400         dim_t t = y;
401         y = x % y;
402         x = t;
403     }
404     return x;
407 dim_t bli_lcm( dim_t x, dim_t y)
409     return x * y / bli_gcd( x, y );