1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34 #include "blis.h"
36 #if defined(BLIS_ENABLE_C66X_OPENCL)
/*
 * printf-style output routine for the C66x OpenCL build.
 *
 * _fp      Not used; output always goes to stdout.
 * _format  printf-style format string.
 * ...      Arguments consumed by _format.
 *
 * Always returns 0 (not the number of characters written).
 */
int ti_printf(FILE* _fp, const char *_format, ...)
{
	va_list argptr;

	// The stream argument is accepted but intentionally ignored.
	( void )_fp;

	va_start( argptr, _format );

	// A va_list has to be consumed by the v*printf family. Passing it to
	// printf() as though it were the first variadic argument is undefined
	// behavior and prints garbage (or crashes) for any conversion.
	vprintf( _format, argptr );

	va_end( argptr );

	return 0;
}
45 #endif
47 #if defined(BLIS_ENABLE_C66X_BUILD) && defined(BLIS_ENABLE_C66X_MEM_POOLS) && defined(BLIS_ENABLE_C66X_EDMA)
48 //#define BLIS_EDMA_DEBUG
49 //#define BLIS_ENABLE_CYCLE_COUNT
50 static dma_t dma_pools[BLIS_MAX_NUM_THREADS]; // One pool for each core.
52 static lib_emt_Handle pool_coreX_emt_handles[BLIS_MAX_NUM_THREADS][BLIS_C66X_EDMA_MAX_NUM_CHANNELS];
54 //for initilization during encoding the dma control leaf
55 dmam_t* bli_dmam_cntl_obj_create( impl_t impl_type,
56 varnum_t var_num,
57 blksz_t* mc,
58 blksz_t* nc,
59 membuf_t dma_buf_type )
60 {
61 dmam_t* cntl;
63 cntl = ( dmam_t* ) bli_malloc( sizeof(dmam_t) );
65 cntl->impl_type = impl_type;
66 cntl->var_num = var_num;
67 cntl->mc = mc;
68 cntl->nc = nc;
69 cntl->dma_buf_type = dma_buf_type;
71 return cntl;
72 }
74 void bli_dma_init()
75 {
76 // Create threads so that initialization is done on each core.
77 _Pragma( "omp parallel num_threads(BLIS_MAX_NUM_THREADS)" )
78 {
79 gint_t status; //int32_t
80 status = lib_emt_init();
82 if(status != LIB_EMT_SUCCESS)
83 {
84 printf("Core %d DMA not initialized\n", lib_get_coreID());
85 exit(1);
86 }
87 bli_dma_init_pool(status, BLIS_C66X_EDMA_MAX_NUM_CHANNELS, pool_coreX_emt_handles[lib_get_coreID()], &dma_pools[lib_get_coreID()]);
88 }
89 }
91 void bli_dma_init_pool(gint_t emt_status,
92 gint_t num_channels,
93 lib_emt_Handle* pool_emt_handles,
94 dma_t* dma_pool)
95 {
96 dim_t i;
97 lib_emt_Handle temp_handle;
98 if(emt_status != LIB_EMT_SUCCESS)
99 {
100 dma_pool->emt_status = FALSE;
101 printf("Core %d DMA not initialized\n", lib_get_coreID());
102 return;
103 }
104 dma_pool->emt_status = TRUE;
106 for(i = 0; i < num_channels; i++)
107 {
108 //pool_emt_handles[i] = lib_emt_alloc(1);
109 temp_handle = lib_emt_alloc(1);
110 if(temp_handle == NULL)
111 {
112 printf("Failed to alloc edma handle CoreID %d\n", lib_get_coreID());
113 return;
114 }
116 pool_emt_handles[i] = temp_handle;
117 }
118 dma_pool->num_channels = num_channels;
119 dma_pool->emt_handle = pool_emt_handles;
120 dma_pool->top_index = num_channels-1;
121 #ifdef BLIS_EDMA_DEBUG
122 printf("Core ID %d, Dma pool top index %d, num channels = %d\n",lib_get_coreID(), dma_pool->top_index, dma_pool->num_channels);
123 #endif
124 }
126 void bli_dma_channel_acquire(lib_emt_Handle* emt_handle, dim_t core_id)
127 {
128 dma_t* dma_pool;
129 lib_emt_Handle* emt_handle_ptrs;
130 dim_t i;
132 dma_pool = &dma_pools[core_id];
134 // Check if channels are still available
135 if(dma_pool->top_index == -1)
136 {
137 printf("DMA pool exhausted\n");
138 return;
139 }
141 // Get all the handles of DMA pool
142 emt_handle_ptrs = dma_pool->emt_handle;
144 //Get index of the top most available handle
145 i = dma_pool->top_index;
147 //Get edma handle
148 *emt_handle = emt_handle_ptrs[i];
149 #ifdef BLIS_EDMA_DEBUG
150 printf("Acquiring DMA handle, top index %d edma handle %x %x\n", i, *emt_handle, emt_handle_ptrs[i]);
151 #endif
152 // Decrement the index so that it now points to the next available handle.
153 dma_pool->top_index--;
155 }
157 void bli_dma_channel_release(lib_emt_Handle emt_handle, dim_t core_id)
158 {
159 dma_t* dma_pool;
160 lib_emt_Handle* emt_handle_ptrs;
161 dim_t i;
163 if(emt_handle == NULL)
164 {
165 printf("nothing to release\n");
166 return;
167 }
168 dma_pool = &dma_pools[core_id];
170 // Get all the handles of DMA pool
171 emt_handle_ptrs = dma_pool->emt_handle;
173 // Increment the index so that it now points to the next available handle.
174 dma_pool->top_index++;
176 //Get this new top index
177 i = dma_pool->top_index;
179 //Place the edma handle back onto the top of the dma pool.
180 // This is done so that if handles were release not in the same order
181 // that they were acquired, the next time a handle is acquired it gets the latest released one.
182 emt_handle_ptrs[i] = emt_handle;
184 #ifdef BLIS_EDMA_DEBUG
185 printf("Released DMA handle, top index %d edma handle %x \n", i, emt_handle_ptrs[i]);
186 #endif
187 }
189 void bli_dmam_init( obj_t* a,
190 obj_t* p,
191 dmam_t* cntl)
192 {
193 dim_t m_root, n_root;
194 dim_t m_p, n_p;
195 dim_t m_transfer, n_transfer;
196 struc_t struc_p;
197 uplo_t uplo_p;
198 doff_t diagoff_p;
200 mem_t* dmamem_a; //source mem_t
201 mem_t* dmamem_p; // destination mem_t
202 siz_t size_p; //Size in bytes of the destination buffer. Will this be same as the source?
203 membuf_t dma_buf_type;
205 inc_t rs_a, cs_a;
206 inc_t rs_p, cs_p;
207 inc_t offm_p, offn_p;
208 inc_t offm_a, offn_a;
210 siz_t elem_size;
212 void* buf; //Will point to the data buffer within destination
215 if ( bli_error_checking_is_enabled() )
216 bli_dmam_init_check( a, p, cntl ); //Need to add functionality
218 // First check if we are to skip this operation because the control tree
219 // is NULL, and if so, simply alias the object to its destination counterpart.
220 if ( cntl_is_noop( cntl ) )
221 {
222 bli_obj_alias_to( *a, *p );
223 return;
224 }
226 if ( bli_obj_is_zeros( *a ) )
227 {
228 bli_obj_alias_for_dma( *a, *p );
229 return;
230 }
232 // Get dma_mem field from the source
233 dmamem_a = bli_obj_dma_mem( *a );
235 // Check if source object is already in destination buffer type. If so alias,
236 // simply alias the object to its destination counterpart.
237 if ( dmamem_a->buf_type == cntl->dma_buf_type)
238 {
239 bli_obj_alias_to(*a, *p);
240 return;
241 }
244 // If DMA needs to occur, copy the basic fields of the object.
245 bli_obj_alias_for_dma( *a, *p );
247 rs_a = bli_obj_row_stride(*a);
248 cs_a = bli_obj_col_stride(*a);
250 m_root = bli_obj_length( *(bli_obj_root( *a )) );
251 n_root = bli_obj_width( *(bli_obj_root( *a )) );
253 offm_a = bli_obj_row_offset( *a );
254 offn_a = bli_obj_col_offset( *a );
256 m_p = bli_obj_length( *p );
257 n_p = bli_obj_width( *p );
258 diagoff_p = bli_obj_diag_offset( *p );
259 struc_p = bli_obj_struc( *p );
260 uplo_p = bli_obj_uplo( *p );
264 elem_size = p->elem_size;
266 /* If the partition is not a general and if it intersects the diagonal, then we
267 * must transfer rows/columns that are outside the partition so that the packing
268 * routine can "densify" or symmetrize the panel.
269 */
270 if(!bli_obj_root_is_general( *p ) && bli_obj_intersects_diag( *p ) && !(bli_is_triangular( struc_p )))
271 {
272 //printf("not general and diag intersects\n");
273 if(bli_is_herm_or_symm( struc_p ))
274 {
275 if(bli_is_lower(uplo_p))
276 {
277 //printf("lower diagoff %d m_p %d, n_p %d offm_a %d, offn_a %d\n", diagoff_p, m_p, n_p, offm_a, offn_a);
278 m_transfer = bli_max(m_p, n_p - diagoff_p);
279 n_transfer = bli_max(m_p, bli_min (n_p, m_p + diagoff_p ));
281 if (offm_a + m_transfer > m_root)
282 m_transfer = m_root - offm_a;
284 offm_p = 0;
285 if(diagoff_p < 0 )
286 offn_p = -diagoff_p;
287 else
288 offn_p = 0;
290 if (bli_is_col_stored( rs_a , cs_a ))
291 {
292 rs_p = 1;
293 cs_p = m_transfer;
294 }
295 else if (bli_is_row_stored( rs_a , cs_a ))
296 {
297 rs_p = n_transfer;
298 cs_p = 1;
299 }
300 }
301 else if (bli_is_upper(uplo_p))
302 {
303 //printf("upper diagoff %d\n", diagoff_p);
304 m_transfer = bli_max(m_p, m_p + diagoff_p);
305 n_transfer = bli_max(m_p, bli_min (n_p, n_p - diagoff_p ));
307 if (offn_a + n_transfer > n_root)
308 n_transfer = n_root - offn_a;
310 //printf("upper diagoff %d m_p %d, n_p %d offm_a %d, offn_a %d\n", diagoff_p, m_p, n_p, offm_a, offn_a);
312 if(diagoff_p > 0 )
313 {
314 offm_p = diagoff_p;
315 offn_p = -diagoff_p; // Move to the right
316 }
317 else
318 {
319 offm_p = 0;
320 offn_p = 0;
321 }
323 if (bli_is_col_stored( rs_a , cs_a ))
324 {
325 rs_p = 1;
326 cs_p = m_transfer;
327 }
328 else if (bli_is_row_stored( rs_a , cs_a ))
329 {
330 rs_p = n_transfer;
331 cs_p = 1;
332 }
334 }
335 }
336 }
337 else
338 {
339 offm_p = 0;
340 offn_p = 0;
342 m_transfer = m_p;
343 n_transfer = n_p;
346 if (bli_is_col_stored( rs_a , cs_a ))
347 {
348 rs_p = 1;
349 cs_p = m_p;
350 }
351 else if (bli_is_row_stored( rs_a , cs_a ))
352 {
353 rs_p = n_p;
354 cs_p = 1;
355 }
356 }
358 bli_obj_set_incs( rs_p, cs_p, *p );
359 bli_obj_set_offs( offm_p, offn_p, *p );
361 size_p = m_transfer * n_transfer * elem_size;
363 //printf("m_p %d n_p %d m_trans %d, n_trans %d size %d\n",m_p, n_p, m_transfer, n_transfer, size_p);
365 dmamem_p = bli_obj_dma_mem(*p);
366 dma_buf_type = cntl->dma_buf_type;
368 if ( bli_mem_is_unalloc( dmamem_p ) )
369 {
370 // If the mem_t object of p has not yet been allocated, then acquire
371 // a memory block of type pack_buf_type.
372 // Will need to change this to allocate memory from the correct location
373 bli_mem_acquire_m( size_p,
374 dma_buf_type,
375 dmamem_p );
376 }
377 else
378 {
379 // If the mem_t object is currently allocated and smaller than is
380 // needed, then it must have been allocated for a different type
381 // of object (a different pack_buf_type value), so we must first
382 // release it and then re-acquire it using the new size and new
383 // pack_buf_type value.
384 if ( bli_mem_size( dmamem_p ) < size_p )
385 {
386 bli_mem_release( dmamem_p );
387 bli_mem_acquire_m( size_p,
388 dma_buf_type,
389 dmamem_p );
390 }
391 }
394 //Set buffer in obj_t to point to the buffer in dma_mem;
396 buf = bli_mem_buffer( dmamem_p );
397 bli_obj_set_buffer( buf, *p );
399 //If definition does not have an EDMA channel, then acquire a channel from the pool
400 if(p->emt_handle == NULL)
401 {
402 bli_dma_channel_acquire(&(p->emt_handle), lib_get_coreID());
403 if(p->emt_handle == NULL)
404 printf("DMAM_INIT Failed to alloc edma handle CoreID %d %x\n", lib_get_coreID(), p->emt_handle);
405 }
406 }
408 void bli_dmam_init_check(obj_t* a,
409 obj_t* p,
410 dmam_t* cntl)
411 {
415 }
417 void bli_dmam_int( obj_t* a,
418 obj_t* p,
419 dmam_t* cntl,
420 dmam_thrinfo_t* thread )
421 {
422 mem_t* dmamem_a; //source mem_t
424 // First check if we are to skip this operation because the control tree
425 // is NULL. We return without taking any action because a was already
426 // aliased to p in dmam_init().
427 if ( cntl_is_noop( cntl ) )
428 {
429 return;
430 }
432 if ( bli_obj_is_zeros( *a ) )
433 {
434 return;
435 }
438 // Get dma_mem field from the source
439 dmamem_a = bli_obj_dma_mem( *a );
441 // Check if source object is already in destination buffer type. If so the destination
442 // has already been aliased during the initialization, so can return.
443 if ( dmamem_a->buf_type == cntl->dma_buf_type)
444 {
445 return;
446 }
448 bli_dma_var1(a, p, thread);
450 }
452 void bli_dma_var1( obj_t* a,
453 obj_t* p,
454 dmam_thrinfo_t* thread )
455 {
456 dim_t m_root, n_root;
457 dim_t m_p, n_p;
458 dim_t m_transfer, n_transfer;
459 struc_t struc_p;
460 uplo_t uplo_p;
461 doff_t diagoff_p;
462 dim_t offm_p;
463 dim_t offn_p;
465 dim_t offm_a, offn_a;
467 inc_t cs_source;
468 inc_t rs_source;
470 inc_t cs_dest;
471 inc_t rs_dest;
473 dim_t num_iter = 0; // # of iterations of memcpy will be required.
474 dim_t iter; //loop counter for iterations
476 dim_t ld_source = 0; // leading dimension of the source.
477 //Indicates the number of elements to be jumped to reach the next row/column.
478 // This value will be the (row/column stride)*size of element;
479 dim_t ld_dest = 0; // leading dimension of the destination.
481 siz_t elem_size;
482 siz_t elem_move = 0; // number of bytes to be moved during the memcpy
484 void* ptr_source;
485 void* ptr_dest;
487 #ifdef BLIS_ENABLE_CYCLE_COUNT
488 volatile int counter_start;
489 volatile int counter_end;
490 #endif
492 m_root = bli_obj_length( *(bli_obj_root( *a )) );
493 n_root = bli_obj_width( *(bli_obj_root( *a )) );
495 offm_a = bli_obj_row_offset( *a );
496 offn_a = bli_obj_col_offset( *a );
498 m_p = bli_obj_length( *p );
499 n_p = bli_obj_width( *p );
501 cs_source = bli_obj_col_stride( *a ) ;
502 rs_source = bli_obj_row_stride( *a ) ;
504 cs_dest = bli_obj_col_stride( *p ) ;
505 rs_dest = bli_obj_row_stride( *p ) ;
507 elem_size = bli_obj_elem_size( *a );
509 struc_p = bli_obj_struc( *p );
510 uplo_p = bli_obj_uplo( *p );
511 diagoff_p = bli_obj_diag_offset( *p );
513 if(!bli_obj_root_is_general( *p ) && bli_obj_intersects_diag( *p ) && !(bli_is_triangular( struc_p )))
514 {
515 //printf("not general and diag intersects\n");
516 if(bli_is_herm_or_symm( struc_p ))
517 {
518 if(bli_is_lower(uplo_p))
519 {
520 m_transfer = bli_max (m_p, n_p - diagoff_p);
521 n_transfer = bli_max(m_p, bli_min (n_p, m_p + diagoff_p ));
522 if (offm_a + m_transfer > m_root)
523 m_transfer = m_root - offm_a;
524 }
525 else if (bli_is_upper(uplo_p))
526 {
527 m_transfer = bli_max(m_p, m_p + diagoff_p);
528 n_transfer = bli_max(m_p, bli_min (n_p, n_p - diagoff_p ));
530 if (offn_a + n_transfer > n_root)
531 n_transfer = n_root - offn_a;
532 }
533 }
534 }
535 else
536 {
537 m_transfer = m_p;
538 n_transfer = n_p;
539 }
542 //If source is column major
543 if( bli_is_col_stored( rs_source, cs_source ) )
544 {
545 //If the source is stored in column major, then the number of iterations of
546 // memcpy will equal to the number of columns, because each column will be copied
547 // at once in each iteration.
548 //num_iter = bli_obj_width(*a);
549 num_iter = n_transfer;
551 // the source.buffer is a void pointer. Hence, when jumping to the next column we
552 // have to calculate how many bytes and not elements the pointer needs to move
553 // to get to the next column
554 ld_source = cs_source * elem_size;
555 ld_dest = cs_dest * elem_size;
557 // for column-major, each column will be moved at a time.
558 // Therefore, the number of bytes to be moved will be
559 // equal to the number of rows in each column multiplied
560 // by the size of each element.
561 //elem_move = bli_obj_length(*a) * elem_size;
563 // equal to the column stride of the destination. (In case
564 // of symmetric or Hermitian matrices we may be transferring more rows
565 // than the length of the destination.
566 elem_move = cs_dest * elem_size;
569 }
570 //if source is row-major stored
571 else if( bli_is_row_stored( rs_source, cs_source ) )
572 {
573 //If the source is stored in row major, then the number of iterations of
574 // memcpy will equal to the number of rows, because each row will be copied
575 // at once in each iteration.
576 //num_iter = bli_obj_length(*a);
577 num_iter = m_transfer;
579 // the source.buffer is a void pointer. Hence, when jumping to the next row we
580 // have to calculate how many bytes and not elements the pointer needs to move
581 // to get to the next row
582 ld_source = rs_source * elem_size;
583 ld_dest = rs_dest * elem_size;
585 // for row-major, each row will be moved at a time.
586 // Therefore, the number of bytes to be moved will be
587 // equal to the number of columns in each row multiplied
588 // by the size of each element.
589 //elem_move = bli_obj_width(*a) * elem_size;
591 // equal to the row stride of the destination. (In case
592 // of symmetric or Hermitian matrices we may be transferring more rows
593 // than the length of the destination.)
594 elem_move = rs_dest * elem_size;
596 }
598 offm_p = bli_obj_row_offset( *p );
599 offn_p = bli_obj_col_offset( *p );
602 ptr_source = bli_obj_buffer_at_off( *a );
603 ptr_dest = bli_obj_buffer_at_off( *p );
605 //printf("prt_s %x ptr_dest %x offm_p %d offn_p %d \n", ptr_source, ptr_dest, offm_p, offn_p );
606 //printf("num_iter = %d, elem_move = %d, m_trans = %d, n_trans = %d", num_iter, elem_move, m_transfer, n_transfer);
608 if(offm_p != 0 || offn_p != 0)
609 {
610 ptr_source = (void *) (( char* ) ptr_source - ( dim_t ) elem_size * ( offn_p * cs_source + offm_p * rs_source));
611 ptr_dest = (void *) (( char* ) ptr_dest - ( dim_t ) elem_size * ( offn_p * cs_dest + offm_p * rs_dest));
613 }
615 #ifdef BLIS_ENABLE_CYCLE_COUNT
616 TSCL = 0;
617 counter_start = TSCL;
618 #endif
619 #ifdef BLIS_ENABLE_CYCLE_COUNT
620 counter_end = TSCL;
621 printf("Cache invalidate %d \n", counter_end-counter_start);
622 #endif
624 if(thread->work_id == 0)
625 {
626 // ld_source is already in terms of bytes
627 if(ld_source < BLIS_C66X_MAXDMASTRIDE && p->emt_handle != NULL)
628 {
629 int status = -100;
630 #ifdef BLIS_ENABLE_CYCLE_COUNT
631 TSCL = 0;
632 counter_start = TSCL;
633 #endif
634 // The destination object contains the EDMA handle
635 status = lib_emt_copy2D2D ( p->emt_handle,
636 ptr_source,
637 ptr_dest,
638 elem_move,
639 num_iter,
640 ld_source,
641 ld_dest
642 );
643 #ifdef BLIS_ENABLE_CYCLE_COUNT
644 counter_end = TSCL;
645 printf("DMA start %d \n", counter_end-counter_start);
646 #endif
649 if(status != LIB_EMT_SUCCESS)
650 printf("DMA Transfer Error %d \n",status);
651 }
652 else // cannot use DMA since stride is only 16 bit signed
653 {
654 for(iter = 0; iter < num_iter; iter++)
655 {
656 memcpy(ptr_dest, ptr_source, elem_move);
658 ptr_source = (void *) ( (char *) ptr_source + ld_source);
659 ptr_dest = (void *) ( (char *) ptr_dest + ld_dest);
660 }
662 }
663 }
664 }
666 void bli_dmam_wait(obj_t* p, dmam_t* cntl, dmam_thrinfo_t* thread )
667 {
668 if ( cntl_is_noop( cntl ) )
669 {
670 return;
671 }
673 // Wait only if current thread work ID is zero
674 if(thread->work_id == 0)
675 {
676 lib_emt_wait(p->emt_handle);
677 }
678 }
680 void bli_obj_release_emt_handle( obj_t* p)
681 {
682 if ( p->emt_handle != NULL )
683 {
684 bli_dma_channel_release(p->emt_handle, lib_get_coreID());
685 p->emt_handle = NULL;
686 }
687 }
689 void bli_dma_finalize(void)
690 {
691 _Pragma( "omp parallel num_threads(BLIS_MAX_NUM_THREADS)" )
692 {
693 dim_t i;
694 dma_t* dma_pool;
695 lib_emt_Handle* emt_handle;
697 //Create omp threads
698 dma_pool = &dma_pools[lib_get_coreID()];
699 emt_handle = dma_pool->emt_handle;
700 for(i = 0; i < BLIS_C66X_EDMA_MAX_NUM_CHANNELS; i ++)
701 {
702 if( emt_handle[i] != NULL)
703 {
704 if(lib_emt_free( emt_handle[i] ) == LIB_EMT_ERROR_FREE)
705 {
706 printf("ERROR: emt_free\n");
707 }
708 else
709 emt_handle[i] = NULL;
710 }
711 }
712 }
713 }
714 #endif