index ee0a25cafd7661de90513e4c0cff2e90c092eb10..8d23b56e30ceaefa99c3b2d58cc11f52322c58aa 100644 (file)
ctype *cNew0, *cNew1, *cNew2, *cNewTemp; \
\
/*EDMA Declarations */ \
- EdmaMgr_Handle edma_handle_b = NULL; \
- EdmaMgr_Handle edma_handle_c0 = NULL; \
- EdmaMgr_Handle edma_handle_c1 = NULL; \
+ lib_emt_Handle emt_handle_b = NULL; \
+ lib_emt_Handle emt_handle_c0 = NULL; \
+ lib_emt_Handle emt_handle_c1 = NULL; \
+\
+ /*For DSP timing*/ \
+ volatile uint64_t counter_start_ker, counter_start_nr, counter_start_mr; \
+ volatile uint64_t counter_end_ker, counter_end_nr, counter_end_mr; \
+ extern profile_data_t *bli_trmm_profile_data; \
\
/*
Assumptions/assertions:
cNew2 = bli_mem_buffer( &c2_L2_mem ); \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_b), CSL_chipReadDNUM()); \
- if(edma_handle_b == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_b), lib_get_coreID()); \
+ if(emt_handle_b == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle CoreID %d \n", lib_get_coreID()); \
} \
\
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c0), CSL_chipReadDNUM()); \
- if(edma_handle_c0 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c0), lib_get_coreID()); \
+ if(emt_handle_c0 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C0 CoreID %d \n", lib_get_coreID()); \
} \
/*Acquiring an EDMA handle from the pool*/ \
- bli_dma_channel_acquire(&(edma_handle_c1), CSL_chipReadDNUM()); \
- if(edma_handle_c1 == NULL) \
+ bli_dma_channel_acquire(&(emt_handle_c1), lib_get_coreID()); \
+ if(emt_handle_c1 == NULL) \
{ \
- printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", CSL_chipReadDNUM()); \
+ printf("ker_var2 Failed to alloc edma handle for C1 CoreID %d \n", lib_get_coreID()); \
} \
\
n_cur = ( bli_is_not_edge_f( 0, n_iter, n_left ) ? NR : n_left ); \
/* Loop over the n dimension (NR columns at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_nr = lib_clock64(); \
+ } \
/* Transfering MC(=m)xNR*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1, \
+ lib_emt_copy2D2D(emt_handle_c0, c1, \
cNew1, m*sizeof(ctype), \
n_cur, cs_c*sizeof(ctype), cs_c11*sizeof(ctype)); \
} \
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
- EdmaMgr_copy1D1D(edma_handle_b, b1, b1_L1, k_b0111*NR*sizeof(ctype)); \
+ lib_emt_copy1D1D(emt_handle_b, b1, b1_L1, k_b0111*NR*sizeof(ctype)); \
\
- EdmaMgr_wait(edma_handle_c0); \
+ lib_emt_wait(emt_handle_c0); \
if(j < n_iter-1) /* no transfer for last iteration */ \
{ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c0, c1+cstep_c, \
+ lib_emt_copy2D2D(emt_handle_c0, c1+cstep_c, \
cNew0, m*sizeof(ctype), \
n_next, cs_c*sizeof(ctype), \
cs_c11*sizeof(ctype)); \
bli_auxinfo_set_is_b( PACKNR * k_b0111, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1 + ( off_b0111 * PACKMR ) / off_scl, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1 + ( off_b0111 * PACKMR ) / off_scl, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
/*a1_i = a1_L1 + ( off_b0111 * PACKMR ) / off_scl;*/ \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2 + ( off_b0111 * PACKMR ) / off_scl, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2 + ( off_b0111 * PACKMR ) / off_scl, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND],\
+ (counter_end_ker-counter_start_ker), 2*k_b0111*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k_b0111*m*n_cur); \
+ } \
} \
\
b1 += ps_b_cur; \
bli_auxinfo_set_is_b( istep_b, aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_mr = lib_clock64(); \
+ } \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
if(i == 0) \
{ \
- idma1_setup(a2_L1, a1, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a1, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
- while(!idma1_done()){;} \
+ lib_imt_wait(); \
temp = a1_L1; \
a1_L1 = a2_L1; \
a2_L1 = temp; \
if(i == 0) \
{ \
- EdmaMgr_wait(edma_handle_b);\
+ lib_emt_wait(emt_handle_b);\
} \
\
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
else \
{ \
/*Start next panel*/ \
- idma1_setup(a2_L1, a2, k_b0111*MR*sizeof(ctype), 0, 0, 7); \
+ lib_imt_copy(a2, a2_L1, k_b0111*MR*sizeof(ctype)); \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Handle interior and edge cases separately. */ \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_start_ker = lib_clock64(); \
+ } \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
ct, rs_ct, cs_ct, \
c11, rs_c11, cs_c11 ); \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_ker = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_KER_LOOP_IND], \
+ (counter_end_ker-counter_start_ker), 2*k*m_cur*n_cur); \
+ } \
} \
\
a1 += rstep_a; \
c11 += rstep_c11; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_mr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_IR_LOOP_IND], \
+ (counter_end_mr-counter_start_mr), 2*k*m*n_cur); \
+ } \
} \
\
b1 += cstep_b; \
cNew1 = cNewTemp; \
if(j != 0) /* wait for save c to complete; skip first iteration */ \
{ \
- EdmaMgr_wait(edma_handle_c1); \
+ lib_emt_wait(emt_handle_c1); \
} \
/* save updated c*/ \
if (cs_c*sizeof(ctype) < BLIS_C66X_MAXDMASTRIDE) \
{ \
- EdmaMgr_copy2D2DSep(edma_handle_c1, cNew2, c1, m*sizeof(ctype), \
+ lib_emt_copy2D2D(emt_handle_c1, cNew2, c1, m*sizeof(ctype), \
n_cur, cs_c11*sizeof(ctype), cs_c*sizeof(ctype)); \
}\
else \
\
c1 += cstep_c; \
} \
+ if (BLIS_ENABLE_PROFILE_KERVAR2 == 1) \
+ { \
+ counter_end_nr = lib_clock64(); \
+ bli_profile_data_update(bli_trmm_profile_data[bli_get_thread_num()+BLIS_MAX_NUM_THREADS*BLIS_PROFILE_JR_LOOP_IND], \
+ (counter_end_nr-counter_start_nr), 2*k*m*n); \
+ } \
\
bli_mem_release( &c2_L2_mem ); \
bli_mem_release( &c1_L2_mem ); \
bli_mem_release( &a2_L1_mem ); \
bli_mem_release( &a1_L1_mem ); \
bli_mem_release( &b1_L1_mem ); \
- if ( edma_handle_b != NULL ) \
+ if ( emt_handle_b != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_b, CSL_chipReadDNUM()); \
- edma_handle_b = NULL; \
+ bli_dma_channel_release(emt_handle_b, lib_get_coreID()); \
+ emt_handle_b = NULL; \
} \
- if ( edma_handle_c0 != NULL ) \
+ if ( emt_handle_c0 != NULL ) \
{ \
- bli_dma_channel_release(edma_handle_c0, CSL_chipReadDNUM()); \
- edma_handle_c0 = NULL; \
+ bli_dma_channel_release(emt_handle_c0, lib_get_coreID()); \
+ emt_handle_c0 = NULL; \
} \
- if ( edma_handle_c1 != NULL ) \
+ if ( emt_handle_c1 != NULL ) \
{ \
- EdmaMgr_wait(edma_handle_c1); /* wait for save c to complete */ \
- bli_dma_channel_release(edma_handle_c1, CSL_chipReadDNUM()); \
- edma_handle_c1 = NULL; \
+ lib_emt_wait(emt_handle_c1); /* wait for save c to complete */ \
+ bli_dma_channel_release(emt_handle_c1, lib_get_coreID()); \
+ emt_handle_c1 = NULL; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \