summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: 2627312)
raw | patch | inline | side by side (parent: 2627312)
author | Jianzhong Xu <a0869574@ti.com> | |
Thu, 19 May 2016 16:42:23 +0000 (12:42 -0400) | ||
committer | Jianzhong Xu <a0869574@ti.com> | |
Thu, 19 May 2016 16:42:23 +0000 (12:42 -0400) |
17 files changed:
diff --git a/src/ti/linalg/blis/frame/3/gemm/bli_gemm_blk_var1f.c b/src/ti/linalg/blis/frame/3/gemm/bli_gemm_blk_var1f.c
index fd421d8f9dabbaefeef4418b20f0d8995ed5de72..ac0ee053e5d8f01539bfd2faa66441a084365260 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t m_trans;
#ifdef BLIS_ENABLE_C66X_EDMA
gemm_thread_sub_opackm( thread ) );
#endif
- // Query dimension in partitioning direction.
- m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
-// bli_get_range( thread, 0, m_trans,
-// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
-// &start, &end );
-
- bli_get_range_t2b( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
+ bli_get_range_t2b( thread, a,
+ bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the m dimension.
diff --git a/src/ti/linalg/blis/frame/3/gemm/bli_gemm_blk_var2f.c b/src/ti/linalg/blis/frame/3/gemm/bli_gemm_blk_var2f.c
index 8da0f33c5f39ea547c61216f673e20127564270f..c473750baa544d0b5f6516870248e7ba727b3303 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
-// printf("blk var 2\n");
-
- if( thread_am_ochief( thread ) ) {
+ if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ),
gemm_thread_sub_opackm( thread ) );
- // Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
-// bli_get_range( thread, 0, n_trans,
-// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
-// &start, &end );
- bli_get_range_l2r( thread, 0, n_trans,
- bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ bli_get_range_l2r( thread, b,
+ bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
- // NOTE: Use of b (for execution datatype) is intentional!
+ /// bli_get_range_l2r( thread, 0, n_trans,
+ // bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
+ // &start, &end );/ NOTE: Use of b (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, end, b,
diff --git a/src/ti/linalg/blis/frame/3/herk/bli_herk_blk_var1f.c b/src/ti/linalg/blis/frame/3/herk/bli_herk_blk_var1f.c
index 492d24611b353260ada30c50ed39a8760a8aa636..c81198fed8b604a82379f17fc366d4981e001a6b 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t m_trans;
#ifdef BLIS_ENABLE_C66X_EDMA
if( thread_am_ochief( thread ) )
herk_thread_sub_opackm( thread ) );
#endif
// Query dimension in partitioning direction.
- m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
-// bli_get_range_weighted( thread, 0, m_trans,
-// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
-// bli_obj_is_upper( *c ), &start, &end );
-
-#if 0
- if(bli_is_lower( bli_obj_root_uplo( *c ) ))
- {
- dim_t n_trans;
- n_trans = bli_obj_width_after_trans(*c);
-
- {
- dim_t At, Ar, X;
- dim_t num_threads_At, num_threads_Ar;
- At = ceil(n_trans*n_trans/2);
- Ar = bli_max(0, m_trans - n_trans)*n_trans;
- X = ceil(Ar/At);
-
- if (X > 0)
- {
- num_threads_At = thread->n_way / (1 + X);
- num_threads_Ar = thread->n_way - num_threads_At;
-
- printf("n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", n_trans*n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
-
- if(thread->work_id < num_threads_At)
- {
- dim_t all_start = 0;
- dim_t all_end = n_trans;
- dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
- uplo_t uplo = bli_obj_root_uplo( *c );
- bool_t handle_edge_low = FALSE;
- dim_t n_way = num_threads_At;
- dim_t work_id = thread->work_id;
- dim_t size = all_end - all_start; // partioning only the triangular part
- dim_t width;
- dim_t block_fac_leftover = size % block_factor;
- dim_t i;
- double num;
-
- bli_toggle_uplo(uplo);
-
- //printf("Triangle: work_id = %d \n", thread->work_id);
-
- start = 0;
- end = all_end - all_start;
- num = size * size / ( double )n_way;
-
- for ( i = 0; TRUE; ++i )
- {
- width = ceil( sqrt( start * start + num ) ) - start;
-
- if ( i == 0 && handle_edge_low )
- {
- if ( width % block_factor != block_fac_leftover )
- width += block_fac_leftover - ( width % block_factor );
- }
- else
- {
- if ( width % block_factor != 0 )
- width += block_factor - ( width % block_factor );
- }
-
- if ( work_id == 0 )
- {
- start = start + all_start;
- end = bli_min( start + width, all_end );
- break;
- }
- else
- {
- start = start + width;
- work_id--;
- }
- }
-
- }
- else
- {
- dim_t all_start = n_trans;
- dim_t all_end = m_trans;
- dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
- bool_t handle_edge_low = FALSE;
-
- dim_t n_way = num_threads_Ar;
- dim_t work_id = thread->work_id - num_threads_At;
-
- dim_t size = all_end - all_start;
-
- dim_t n_bf_whole = size / block_factor;
- dim_t n_bf_left = size % block_factor;
-
- dim_t n_bf_lo = n_bf_whole / n_way;
- dim_t n_bf_hi = n_bf_whole / n_way;
-
- //printf("Rectangle: work_id = %d \n", thread->work_id);
-
-
- if ( handle_edge_low == FALSE )
- {
- // Notice that if all threads receive the same number of
- // block_factors, those threads are considered "high" and
- // the "low" thread group is empty.
- dim_t n_th_lo = n_bf_whole % n_way;
- //dim_t n_th_hi = n_way - n_th_lo;
-
-
- // If some partitions must have more block_factors than others
- // assign the slightly larger partitions to lower index threads.
- if ( n_th_lo != 0 ) n_bf_lo += 1;
-
- // Compute the actual widths (in units of rows/columns) of
- // individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
-
- // Precompute the starting indices of the low and high groups.
- dim_t lo_start = all_start;
- dim_t hi_start = all_start + n_th_lo * size_lo;
-
- // Compute the start and end of individual threads' ranges
- // as a function of their work_ids and also the group to which
- // they belong (low or high).
- if ( work_id < n_th_lo )
- {
- start = lo_start + (work_id ) * size_lo;
- end = lo_start + (work_id+1) * size_lo;
- }
- else // if ( n_th_lo <= work_id )
- {
- start = hi_start + (work_id-n_th_lo ) * size_hi;
- end = hi_start + (work_id-n_th_lo+1) * size_hi;
-
- // Since the edge case is being allocated to the high
- // end of the index range, we have to advance the last
- // thread's end.
- if ( work_id == n_way - 1 ) end += n_bf_left;
- }
- }
- else // if ( handle_edge_low == TRUE )
- {
- // Notice that if all threads receive the same number of
- // block_factors, those threads are considered "low" and
- // the "high" thread group is empty.
- dim_t n_th_hi = n_bf_whole % n_way;
- dim_t n_th_lo = n_way - n_th_hi;
-
- // If some partitions must have more block_factors than others
- // assign the slightly larger partitions to higher index threads.
- if ( n_th_hi != 0 ) n_bf_hi += 1;
-
- // Compute the actual widths (in units of rows/columns) of
- // individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
-
- // Precompute the starting indices of the low and high groups.
- dim_t lo_start = all_start;
- dim_t hi_start = all_start + n_th_lo * size_lo
- + n_bf_left;
-
- // Compute the start and end of individual threads' ranges
- // as a function of their work_ids and also the group to which
- // they belong (low or high).
- if ( work_id < n_th_lo )
- {
- start = lo_start + (work_id ) * size_lo;
- end = lo_start + (work_id+1) * size_lo;
-
- // Since the edge case is being allocated to the low
- // end of the index range, we have to advance the
- // starts/ends accordingly.
- if ( work_id == 0 ) end += n_bf_left;
- else { start += n_bf_left;
- end += n_bf_left; }
- }
- else // if ( n_th_lo <= work_id )
- {
- start = hi_start + (work_id-n_th_lo ) * size_hi;
- end = hi_start + (work_id-n_th_lo+1) * size_hi;
- }
- }
- }
- }
- else
- {
- bli_get_range_weighted_t2b( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *c ), &start, &end );
- }
- }
- }
- else
- bli_get_range_weighted_t2b( thread, 0, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *c ), &start, &end );
-#else
- bli_get_range_weighted_t2b( thread, 0, m_trans,
+ bli_get_range_weighted_t2b( thread, c,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *c ), &start, &end );
-#endif
+ &start, &end );
#ifdef BLIS_ENABLE_C66X_EDMA
if(start < end)
diff --git a/src/ti/linalg/blis/frame/3/herk/bli_herk_blk_var2f.c b/src/ti/linalg/blis/frame/3/herk/bli_herk_blk_var2f.c
index 35737049cda564aed09ffbc59e2e95e122d95cc5..dfb36e38490b24057600b050a067df5efb79063c 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
subpart_t stored_part;
// The upper and lower variants are identical, except for which
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *c );
dim_t start, end;
- // Needs to be replaced with a weighted range because triangle
-// bli_get_range_weighted( thread, 0, n_trans,
-// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
-// bli_obj_is_lower( *c ), &start, &end );
-
- bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_get_range_weighted_l2r( thread, c,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *c ), &start, &end );
+ &start, &end );
// Partition along the n dimension.
diff --git a/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var1f.c b/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var1f.c
index 10033137c04be3d6304bbe9db0793046de6a9a2c..0f596a3980b971f6aa566ddcb304b98184a7a84b 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t m_trans;
- dim_t offA;
#ifdef BLIS_ENABLE_C66X_EDMA
obj_t c2;
cntl_sub_packm_b( cntl ),
trmm_thread_sub_opackm( thread ) );
#endif
- // Set the default length of and offset to the non-zero part of A.
- m_trans = bli_obj_length_after_trans( *a );
- offA = 0;
-
- // If A is lower triangular, we have to adjust where the non-zero part of
- // A begins. If A is upper triangular, we have to adjust the length of
- // the non-zero part. If A is general/dense, then we keep the defaults.
- if ( bli_obj_is_lower( *a ) )
- offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
- else if ( bli_obj_is_upper( *a ) )
- m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
- bli_obj_width_after_trans( *a );
dim_t start, end;
-// bli_get_range_weighted( thread, offA, m_trans,
-// bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
-// bli_obj_is_upper( *c ), &start, &end );
+ siz_t area;
-#if 0
- dim_t n_trans;
- n_trans = bli_obj_width_after_trans(*a);
-
- {
- dim_t At, Ar, X;
- dim_t num_threads_At, num_threads_Ar;
- At = ceil(n_trans*n_trans/2);
- Ar = bli_max(0, m_trans - n_trans)*n_trans;
- X = ceil(Ar/At);
-
- printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d\n", offA, m_trans, n_trans, At, Ar, X);
-
- if (X > 0)
- {
- num_threads_At = thread->n_way / (1 + X);
- num_threads_Ar = thread->n_way - num_threads_At;
-
- printf("offA = %d, m_trans = %d, n_trans = %d At = %d, Ar = %d, X = %d, num_threads_At = %d, num_threads_Ar = %d\n", offA, m_trans, n_trans, At, Ar, X, num_threads_At, num_threads_Ar);
-
- if(thread->work_id < num_threads_At)
- {
- dim_t all_start = offA;
- dim_t all_end = offA + n_trans;
- dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
- uplo_t uplo = bli_obj_root_uplo( *a );
- bool_t handle_edge_low = FALSE;
- dim_t n_way = num_threads_At;
- dim_t work_id = thread->work_id;
- dim_t size = all_end - all_start; // partioning only the triangular part
- dim_t width;
- dim_t block_fac_leftover = size % block_factor;
- dim_t i;
- double num;
-
- bli_toggle_uplo(uplo);
-
- //printf("Triangle: work_id = %d \n", thread->work_id);
-
- start = 0;
- end = all_end - all_start;
- num = size * size / ( double )n_way;
-
- for ( i = 0; TRUE; ++i )
- {
- width = ceil( sqrt( start * start + num ) ) - start;
-
- if ( i == 0 && handle_edge_low )
- {
- if ( width % block_factor != block_fac_leftover )
- width += block_fac_leftover - ( width % block_factor );
- }
- else
- {
- if ( width % block_factor != 0 )
- width += block_factor - ( width % block_factor );
- }
-
- if ( work_id == 0 )
- {
- start = start + all_start;
- end = bli_min( start + width, all_end );
- break;
- }
- else
- {
- start = start + width;
- work_id--;
- }
- }
-
- }
- else
- {
- dim_t all_start = offA + n_trans;
- dim_t all_end = m_trans;
- dim_t block_factor = bli_determine_reg_blocksize( a, cntl_blocksize( cntl ));
- bool_t handle_edge_low = FALSE;
-
- dim_t n_way = num_threads_Ar;
- dim_t work_id = thread->work_id - num_threads_At;
-
- dim_t size = all_end - all_start;
-
- dim_t n_bf_whole = size / block_factor;
- dim_t n_bf_left = size % block_factor;
-
- dim_t n_bf_lo = n_bf_whole / n_way;
- dim_t n_bf_hi = n_bf_whole / n_way;
-
- //printf("Rectangle: work_id = %d \n", thread->work_id);
-
-
- if ( handle_edge_low == FALSE )
- {
- // Notice that if all threads receive the same number of
- // block_factors, those threads are considered "high" and
- // the "low" thread group is empty.
- dim_t n_th_lo = n_bf_whole % n_way;
- //dim_t n_th_hi = n_way - n_th_lo;
-
-
- // If some partitions must have more block_factors than others
- // assign the slightly larger partitions to lower index threads.
- if ( n_th_lo != 0 ) n_bf_lo += 1;
-
- // Compute the actual widths (in units of rows/columns) of
- // individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
-
- // Precompute the starting indices of the low and high groups.
- dim_t lo_start = all_start;
- dim_t hi_start = all_start + n_th_lo * size_lo;
-
- // Compute the start and end of individual threads' ranges
- // as a function of their work_ids and also the group to which
- // they belong (low or high).
- if ( work_id < n_th_lo )
- {
- start = lo_start + (work_id ) * size_lo;
- end = lo_start + (work_id+1) * size_lo;
- }
- else // if ( n_th_lo <= work_id )
- {
- start = hi_start + (work_id-n_th_lo ) * size_hi;
- end = hi_start + (work_id-n_th_lo+1) * size_hi;
-
- // Since the edge case is being allocated to the high
- // end of the index range, we have to advance the last
- // thread's end.
- if ( work_id == n_way - 1 ) end += n_bf_left;
- }
- }
- else // if ( handle_edge_low == TRUE )
- {
- // Notice that if all threads receive the same number of
- // block_factors, those threads are considered "low" and
- // the "high" thread group is empty.
- dim_t n_th_hi = n_bf_whole % n_way;
- dim_t n_th_lo = n_way - n_th_hi;
-
- // If some partitions must have more block_factors than others
- // assign the slightly larger partitions to higher index threads.
- if ( n_th_hi != 0 ) n_bf_hi += 1;
-
- // Compute the actual widths (in units of rows/columns) of
- // individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
-
- // Precompute the starting indices of the low and high groups.
- dim_t lo_start = all_start;
- dim_t hi_start = all_start + n_th_lo * size_lo
- + n_bf_left;
-
- // Compute the start and end of individual threads' ranges
- // as a function of their work_ids and also the group to which
- // they belong (low or high).
- if ( work_id < n_th_lo )
- {
- start = lo_start + (work_id ) * size_lo;
- end = lo_start + (work_id+1) * size_lo;
-
- // Since the edge case is being allocated to the low
- // end of the index range, we have to advance the
- // starts/ends accordingly.
- if ( work_id == 0 ) end += n_bf_left;
- else { start += n_bf_left;
- end += n_bf_left; }
- }
- else // if ( n_th_lo <= work_id )
- {
- start = hi_start + (work_id-n_th_lo ) * size_hi;
- end = hi_start + (work_id-n_th_lo+1) * size_hi;
- }
- }
- }
- }
- else
- {
- bli_get_range_weighted_t2b( thread, offA, m_trans,
- bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *a ), &start, &end );
- }
- }
-#else
- bli_get_range_weighted_t2b( thread, offA, m_trans,
+ area = bli_get_range_weighted_t2b( thread, a,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *a ), &start, &end );
-#endif
-
+ &start, &end );
#ifdef BLIS_ENABLE_C66X_EDMA
if(start<end)
diff --git a/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var2b.c b/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var2b.c
index e4e696b93d6ad1edc6751df54e91bad34478bc18..05865196bb2b961369a608f6bc91bc0e42e34ae1 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
-
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
-// bli_get_range_weighted( thread, 0, n_trans,
-// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
-// bli_obj_is_upper( *c ), &start, &end );
- bli_get_range_weighted_r2l( thread, 0, n_trans,
+ bli_get_range_weighted_r2l( thread, b,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *b ), &start, &end );
+ &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
diff --git a/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var2f.c b/src/ti/linalg/blis/frame/3/trmm/bli_trmm_blk_var2f.c
index 6ddd1dc543a181ff5f927bbc688e83e385b9d192..2ed2d4ccaa22583ec5c652c7c19bbfe852a92a05 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
-
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
- // Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
-// bli_get_range_weighted( thread, 0, n_trans,
-// bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
-// bli_obj_is_lower( *c ), &start, &end );
-
- bli_get_range_weighted_l2r( thread, 0, n_trans,
+ bli_get_range_weighted_l2r( thread, b,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
- bli_obj_root_uplo( *b ), &start, &end );
+ &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
diff --git a/src/ti/linalg/blis/frame/3/trmm/bli_trmm_front.c b/src/ti/linalg/blis/frame/3/trmm/bli_trmm_front.c
index 39a9815954edab4255f95464dc76ced443aa55cc..9c010f2973f81020b39dc2b88c9ccbf26a7e5281 100644 (file)
#endif
// Check parameters.
+
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b );
diff --git a/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var1b.c b/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var1b.c
index c85eee256b55d0790568057449210597912bcc2a..c73ab282dd4e135b4e29c50dff8de297a2ff6dba 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t m_trans;
- dim_t offA;
+
#ifdef BLIS_ENABLE_C66X_EDMA
dim_t b_alg_next;
#endif
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
- // Set the default length of and offset to the non-zero part of A.
- m_trans = bli_obj_length_after_trans( *a );
- offA = 0;
-
- // If A is upper triangular, we have to adjust where the non-zero part of
- // A begins.
- if ( bli_obj_is_upper( *a ) )
- offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) -
- bli_obj_width_after_trans( *a );
-
dim_t start, end;
num_t dt = bli_obj_execution_datatype( *a );
-// bli_get_range( thread, offA, m_trans,
-// //bli_lcm( bli_info_get_default_nr( datatype ), bli_info_get_default_mr( datatype ) ),
-// bli_info_get_default_mc( datatype ),
-// &start, &end );
-
- bli_get_range_b2t( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
- bli_info_get_default_mc( dt ),
+ dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
+ bli_info_get_default_mr( dt ) :
+ bli_info_get_default_nr( dt ) );
+
+ bli_get_range_b2t( thread, a, bf,
&start, &end );
// Partition along the remaining portion of the m dimension.
diff --git a/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var1f.c b/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var1f.c
index 8c7d159580568e9724d04c33c06cec8b772c8406..9c42c2cc371c9659d9c5eb4dfd3942663124bd80 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t m_trans;
- dim_t offA;
+
#ifdef BLIS_ENABLE_C66X_EDMA
dim_t b_alg_next;
#endif
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
- // Set the default length of and offset to the non-zero part of A.
- m_trans = bli_obj_length_after_trans( *a );
- offA = 0;
-
- // If A is lower triangular, we have to adjust where the non-zero part of
- // A begins.
- if ( bli_obj_is_lower( *a ) )
- offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
-
dim_t start, end;
num_t dt = bli_obj_execution_datatype( *a );
- bli_get_range_t2b( thread, offA, m_trans,
- //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
- bli_info_get_default_mc( dt ),
+ dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
+ bli_info_get_default_mr( dt ) :
+ bli_info_get_default_nr( dt ) );
+ bli_get_range_t2b( thread, a, bf,
&start, &end );
// Partition along the remaining portion of the m dimension.
diff --git a/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var2b.c b/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var2b.c
index 69550811b9689f2aaf3176f04fb18651b4f9db21..4420608bed90fd8ea45d9807edb441519f566075 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
- // Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
num_t dt = bli_obj_execution_datatype( *a );
+ dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
+ bli_info_get_default_mr( dt ) :
+ bli_info_get_default_nr( dt ) );
- bli_get_range_r2l( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
- &start, &end );
+ bli_get_range_r2l( thread, b, bf,
+ &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
diff --git a/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var2f.c b/src/ti/linalg/blis/frame/3/trsm/bli_trsm_blk_var2f.c
index 78ed4f6f80034f40fa5344e41e04b290f987105f..0ec22d2aea4da64273051ebe1bca0bc33a355e6b 100644 (file)
dim_t i;
dim_t b_alg;
- dim_t n_trans;
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
- n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
num_t dt = bli_obj_execution_datatype( *a );
-// bli_get_range_l2r( thread, 0, n_trans,
-// //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
-// // bli_info_get_default_mr( BLIS_TRSM, dt ) ),
-// bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
-// bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
-// &start, &end );
-
- bli_get_range_l2r( thread, 0, n_trans,
- bli_lcm( bli_info_get_default_nr( dt ), bli_info_get_default_mr( dt ) ),
+ dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
+ bli_info_get_default_mr( dt ) :
+ bli_info_get_default_nr( dt ) );
+
+ bli_get_range_l2r( thread, b, bf,
&start, &end );
//printf("blk_var2f n = %d end = %d\n", n_trans, end);
diff --git a/src/ti/linalg/blis/frame/base/bli_threading.c b/src/ti/linalg/blis/frame/base/bli_threading.c
index bd753f808c3197dfc2e1e1624928023846632d22..cc58f9bcf33a4ef47fec01a6cc1a1e619eb10b32 100644 (file)
@@ -341,16 +341,150 @@ void bli_setup_thread_info( thrinfo_t* thr, thread_comm_t* ocomm, dim_t ocomm_id
//}
-void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
+//void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
+//{
+// thrinfo_t* thread = ( thrinfo_t* )thr;
+// dim_t n_way = thread->n_way;
+// dim_t work_id = thread->work_id;
+//
+// dim_t size = all_end - all_start;
+//
+// dim_t n_bf_whole = size / block_factor;
+// dim_t n_bf_left = size % block_factor;
+//
+// dim_t n_bf_lo = n_bf_whole / n_way;
+// dim_t n_bf_hi = n_bf_whole / n_way;
+//
+// // In this function, we partition the space between all_start and
+// // all_end into n_way partitions, each a multiple of block_factor
+// // with the exception of the one partition that recieves the
+// // "edge" case (if applicable).
+// //
+// // Here are examples of various thread partitionings, in units of
+// // the block_factor, when n_way = 4. (A '+' indicates the thread
+// // that receives the leftover edge case (ie: n_bf_left extra
+// // rows/columns in its sub-range).
+// // (all_start ... all_end)
+// // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3
+// // 12 =0 f 0 4 3 3 3 3
+// // 12 >0 f 0 4 3 3 3 3+
+// // 13 >0 f 1 3 4 3 3 3+
+// // 14 >0 f 2 2 4 4 3 3+
+// // 15 >0 f 3 1 4 4 4 3+
+// // 15 =0 f 3 1 4 4 4 3
+// //
+// // 12 =0 t 4 0 3 3 3 3
+// // 12 >0 t 4 0 3+ 3 3 3
+// // 13 >0 t 3 1 3+ 3 3 4
+// // 14 >0 t 2 2 3+ 3 4 4
+// // 15 >0 t 1 3 3+ 4 4 4
+// // 15 =0 t 1 3 3 4 4 4
+//
+// // As indicated by the table above, load is balanced as equally
+// // as possible, even in the presence of an edge case.
+//
+// // First, we must differentiate between cases where the leftover
+// // "edge" case (n_bf_left) should be allocated to a thread partition
+// // at the low end of the index range or the high end.
+//
+// if ( handle_edge_low == FALSE )
+// {
+// // Notice that if all threads receive the same number of
+// // block_factors, those threads are considered "high" and
+// // the "low" thread group is empty.
+// dim_t n_th_lo = n_bf_whole % n_way;
+// //dim_t n_th_hi = n_way - n_th_lo;
+//
+// // If some partitions must have more block_factors than others
+// // assign the slightly larger partitions to lower index threads.
+// if ( n_th_lo != 0 ) n_bf_lo += 1;
+//
+// // Compute the actual widths (in units of rows/columns) of
+// // individual threads in the low and high groups.
+// dim_t size_lo = n_bf_lo * block_factor;
+// dim_t size_hi = n_bf_hi * block_factor;
+//
+// // Precompute the starting indices of the low and high groups.
+// dim_t lo_start = all_start;
+// dim_t hi_start = all_start + n_th_lo * size_lo;
+//
+// // Compute the start and end of individual threads' ranges
+// // as a function of their work_ids and also the group to which
+// // they belong (low or high).
+// if ( work_id < n_th_lo )
+// {
+// *start = lo_start + (work_id ) * size_lo;
+// *end = lo_start + (work_id+1) * size_lo;
+// }
+// else // if ( n_th_lo <= work_id )
+// {
+// *start = hi_start + (work_id-n_th_lo ) * size_hi;
+// *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+//
+// // Since the edge case is being allocated to the high
+// // end of the index range, we have to advance the last
+// // thread's end.
+// if ( work_id == n_way - 1 ) *end += n_bf_left;
+// }
+// }
+// else // if ( handle_edge_low == TRUE )
+// {
+// // Notice that if all threads receive the same number of
+// // block_factors, those threads are considered "low" and
+// // the "high" thread group is empty.
+// dim_t n_th_hi = n_bf_whole % n_way;
+// dim_t n_th_lo = n_way - n_th_hi;
+//
+// // If some partitions must have more block_factors than others
+// // assign the slightly larger partitions to higher index threads.
+// if ( n_th_hi != 0 ) n_bf_hi += 1;
+//
+// // Compute the actual widths (in units of rows/columns) of
+// // individual threads in the low and high groups.
+// dim_t size_lo = n_bf_lo * block_factor;
+// dim_t size_hi = n_bf_hi * block_factor;
+//
+// // Precompute the starting indices of the low and high groups.
+// dim_t lo_start = all_start;
+// dim_t hi_start = all_start + n_th_lo * size_lo
+// + n_bf_left;
+//
+// // Compute the start and end of individual threads' ranges
+// // as a function of their work_ids and also the group to which
+// // they belong (low or high).
+// if ( work_id < n_th_lo )
+// {
+// *start = lo_start + (work_id ) * size_lo;
+// *end = lo_start + (work_id+1) * size_lo;
+//
+// // Since the edge case is being allocated to the low
+// // end of the index range, we have to advance the
+// // starts/ends accordingly.
+// if ( work_id == 0 ) *end += n_bf_left;
+// else { *start += n_bf_left;
+// *end += n_bf_left; }
+// }
+// else // if ( n_th_lo <= work_id )
+// {
+// *start = hi_start + (work_id-n_th_lo ) * size_hi;
+// *end = hi_start + (work_id-n_th_lo+1) * size_hi;
+// }
+// }
+//}
+
+void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* start, dim_t* end )
{
thrinfo_t* thread = ( thrinfo_t* )thr;
dim_t n_way = thread->n_way;
dim_t work_id = thread->work_id;
+ dim_t all_start = 0;
+ dim_t all_end = n;
+
dim_t size = all_end - all_start;
- dim_t n_bf_whole = size / block_factor;
- dim_t n_bf_left = size % block_factor;
+ dim_t n_bf_whole = size / bf;
+ dim_t n_bf_left = size % bf;
dim_t n_bf_lo = n_bf_whole / n_way;
dim_t n_bf_hi = n_bf_whole / n_way;
@@ -401,8 +535,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
// Compute the actual widths (in units of rows/columns) of
// individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
+ dim_t size_lo = n_bf_lo * bf;
+ dim_t size_hi = n_bf_hi * bf;
// Precompute the starting indices of the low and high groups.
dim_t lo_start = all_start;
@@ -441,8 +575,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
// Compute the actual widths (in units of rows/columns) of
// individual threads in the low and high groups.
- dim_t size_lo = n_bf_lo * block_factor;
- dim_t size_hi = n_bf_hi * block_factor;
+ dim_t size_lo = n_bf_lo * bf;
+ dim_t size_hi = n_bf_hi * bf;
// Precompute the starting indices of the low and high groups.
dim_t lo_start = all_start;
@@ -472,189 +606,528 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
}
}
-void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- bli_get_range( thr, all_start, all_end, block_factor,
+// bli_get_range( thr, all_start, all_end, block_factor,
+// FALSE, start, end );
+ dim_t m = bli_obj_length_after_trans( *a );
+ dim_t n = bli_obj_width_after_trans( *a );
+
+ bli_get_range( thr, n, bf,
FALSE, start, end );
+
+ return m * ( *end - *start );
}
-void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- bli_get_range( thr, all_start, all_end, block_factor,
+ dim_t m = bli_obj_length_after_trans( *a );
+ dim_t n = bli_obj_width_after_trans( *a );
+
+ bli_get_range( thr, n, bf,
TRUE, start, end );
+
+ return m * ( *end - *start );
}
-void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- bli_get_range( thr, all_start, all_end, block_factor,
+ dim_t m = bli_obj_length_after_trans( *a );
+ dim_t n = bli_obj_width_after_trans( *a );
+
+ bli_get_range( thr, m, bf,
FALSE, start, end );
+
+ return n * ( *end - *start );
}
-void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
+siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- bli_get_range( thr, all_start, all_end, block_factor,
+ dim_t m = bli_obj_length_after_trans( *a );
+ dim_t n = bli_obj_width_after_trans( *a );
+
+ bli_get_range( thr, m, bf,
TRUE, start, end );
+
+ return n * ( *end - *start );
}
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end )
+dim_t bli_get_range_width_l( doff_t diagoff_j,
+ dim_t m,
+ dim_t n_j,
+ dim_t j,
+ dim_t n_way,
+ dim_t bf,
+ dim_t bf_left,
+ double area_per_thr,
+ bool_t handle_edge_low )
{
- thrinfo_t* thread = ( thrinfo_t* )thr;
- dim_t n_way = thread->n_way;
- dim_t work_id = thread->work_id;
- dim_t size = all_end - all_start;
- dim_t width;
- dim_t block_fac_leftover = size % block_factor;
- dim_t i;
- double num;
+ dim_t width;
+
+ // In this function, we assume that we are somewhere in the process of
+ // partitioning an m x n lower-stored region (with arbitrary diagonal
+ // offset) n_ways along the n dimension (into column panels). The value
+ // j identifies the left-to-right subpartition index (from 0 to n_way-1)
+ // of the subpartition whose width we are about to compute using the
+ // area per thread determined by the caller. n_j is the number of
+ // columns in the remaining region of the matrix being partitioned,
+ // and diagoff_j is that region's diagonal offset.
+
+ // If this is the last subpartition, the width is simply equal to n_j.
+ // Note that this statement handles cases where the "edge case" (if
+ // one exists) is assigned to the high end of the index range (ie:
+ // handle_edge_low == FALSE).
+ if ( j == n_way - 1 ) return n_j;
+
+ // At this point, we know there are at least two subpartitions left.
+ // We also know that IF the submatrix contains a completely dense
+ // rectangular submatrix, it will occur BEFORE the triangular (or
+ // trapezoidal) part.
+
+ // Here, we implement a somewhat minor load balancing optimization
+ // that ends up getting employed only for relatively small matrices.
+ // First, recall that all subpartition widths will be some multiple
+ // of the blocking factor bf, except perhaps either the first or last
+ // subpartition, which will receive the edge case, if it exists.
+ // Also recall that j represents the current thread (or thread group,
+ // or "caucus") for which we are computing a subpartition width.
+ // If n_j is sufficiently small that we can only allocate bf columns
+ // to each of the remaining threads, then we set the width to bf. We
+ // do not allow the subpartition width to be less than bf, so, under
+    // some conditions, if n_j is small enough, some of the remaining
+ // threads may not get any work. For the purposes of this lower bound
+ // on work (ie: width >= bf), we allow the edge case to count as a
+ // "full" set of bf columns.
+ {
+ dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );
- *start = 0;
- *end = all_end - all_start;
- num = size * size / ( double )n_way;
+ if ( n_j_bf <= n_way - j )
+ {
+ if ( j == 0 && handle_edge_low )
+ width = ( bf_left > 0 ? bf_left : bf );
+ else
+ width = bf;
- if ( bli_is_lower( uplo ) )
+
+ // Make sure that the width does not exceed n_j. This would
+ // occur if and when n_j_bf < n_way - j; that is, when the
+ // matrix being partitioned is sufficiently small relative to
+ // n_way such that there is not even enough work for every
+ // (remaining) thread to get bf (or bf_left) columns. The
+ // net effect of this safeguard is that some threads may get
+ // assigned empty ranges (ie: no work), which of course must
+ // happen in some situations.
+ if ( width > n_j ) width = n_j;
+
+ return width;
+ }
+ }
+
+ // This block computes the width assuming that we are entirely within
+ // a dense rectangle that precedes the triangular (or trapezoidal)
+ // part.
{
- dim_t cur_caucus = n_way - 1;
- dim_t len = 0;
-
- // This loop computes subpartitions backwards, from the high end
- // of the index range to the low end. If the low end is assumed
- // to be on the left and the high end the right, this assignment
- // of widths is appropriate for n dimension partitioning of a
- // lower triangular matrix.
- for ( i = 0; TRUE; ++i )
+ // First compute the width of the current panel under the
+ // assumption that the diagonal offset would not intersect.
+ width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );
+
+ // Adjust the width, if necessary. Specifically, we may need
+ // to allocate the edge case to the first subpartition, if
+ // requested; otherwise, we just need to ensure that the
+ // subpartition is a multiple of the blocking factor.
+ if ( j == 0 && handle_edge_low )
{
- width = ceil( sqrt( len*len + num ) ) - len;
-
- // If we need to allocate the edge case (assuming it exists)
- // to the high thread subpartition, adjust width so that it
- // contains the exact amount of leftover edge dimension so that
- // all remaining subpartitions can be multiples of block_factor.
- // If the edge case is to be allocated to the low subpartition,
- // or if there is no edge case, it is implicitly allocated to
- // the low subpartition by virtue of the fact that all other
- // subpartitions already assigned will be multiples of
- // block_factor.
- if ( i == 0 && !handle_edge_low )
- {
- if ( width % block_factor != block_fac_leftover )
- width += block_fac_leftover - ( width % block_factor );
- }
- else
- {
- if ( width % block_factor != 0 )
- width += block_factor - ( width % block_factor );
- }
+ if ( width % bf != bf_left ) width += bf_left - ( width % bf );
+ }
+ else // if interior case
+ {
+ // Round up to the next multiple of the blocking factor.
+ //if ( width % bf != 0 ) width += bf - ( width % bf );
+ // Round to the nearest multiple of the blocking factor.
+ if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
- if ( cur_caucus == work_id )
- {
- *start = bli_max( 0, *end - width ) + all_start;
- *end = *end + all_start;
- return;
- }
- else
- {
- *end -= width;
- len += width;
- cur_caucus--;
- }
}
}
- else // if ( bli_is_upper( uplo ) )
+
+ // We need to recompute width if the panel, according to the width
+ // as currently computed, would intersect the diagonal.
+ if ( diagoff_j < width )
{
- // This loop computes subpartitions forwards, from the low end
- // of the index range to the high end. If the low end is assumed
- // to be on the left and the high end the right, this assignment
- // of widths is appropriate for n dimension partitioning of an
- // upper triangular matrix.
- for ( i = 0; TRUE; ++i )
+ dim_t offm_inc, offn_inc;
+
+ // Prune away the unstored region above the diagonal, if it exists.
+ // Note that the entire region was pruned initially, so we know that
+ // we don't need to try to prune the right side. (Also, we discard
+ // the offset deltas since we don't need to actually index into the
+ // subpartition.)
+ bli_prune_unstored_region_top_l( diagoff_j, m, n_j, offm_inc );
+ bli_prune_unstored_region_right_l( diagoff_j, m, n_j, offn_inc );
+
+ // We don't need offm_inc, offn_inc here. These statements should
+ // prevent compiler warnings.
+ ( void )offm_inc;
+ ( void )offn_inc;
+
+ // Prepare to solve a quadratic equation to find the width of the
+ // current (jth) subpartition given the m dimension, diagonal offset,
+ // and area.
+ // NOTE: We know that the +/- in the quadratic formula must be a +
+ // here because we know that the desired solution (the subpartition
+ // width) will be smaller than (m + diagoff), not larger. If you
+ // don't believe me, draw a picture!
+ const double a = -0.5;
+ const double b = ( double )m + ( double )diagoff_j + 0.5;
+ const double c = -0.5 * ( ( double )diagoff_j *
+ ( ( double )diagoff_j + 1.0 )
+ ) - area_per_thr;
+ const double r = b * b - 4.0 * a * c;
+
+ // If the quadratic solution is not imaginary, round it and use that
+ // as our width, but make sure it didn't round to zero. Otherwise,
+ // discard the quadratic solution and leave width, as previously
+ // computed, unchanged.
+ if ( r >= 0.0 )
{
- width = ceil( sqrt( *start * *start + num ) ) - *start;
+ const double x = ( -b + sqrt( r ) ) / ( 2.0 * a );
+ width = ( dim_t )bli_round( x );
+ if ( width == 0 ) width = 1;
+ }
- if ( i == 0 && handle_edge_low )
- {
- if ( width % block_factor != block_fac_leftover )
- width += block_fac_leftover - ( width % block_factor );
- }
- else
- {
- if ( width % block_factor != 0 )
- width += block_factor - ( width % block_factor );
- }
+ // Adjust the width, if necessary.
+ if ( j == 0 && handle_edge_low )
+ {
+ if ( width % bf != bf_left ) width += bf_left - ( width % bf );
+ }
+ else // if interior case
+ {
+ // Round up to the next multiple of the blocking factor.
+ //if ( width % bf != 0 ) width += bf - ( width % bf );
+ // Round to the nearest multiple of the blocking factor.
+ if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
+ }
+ }
- if ( work_id == 0 )
- {
- *start = *start + all_start;
- *end = bli_min( *start + width, all_end );
- return;
- }
- else
+ // Make sure that the width, after being adjusted, does not cause the
+ // subpartition to exceed n_j.
+ if ( width > n_j ) width = n_j;
+
+ return width;
+}
+
+siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff )
+{
+ dim_t offm_inc = 0;
+ dim_t offn_inc = 0;
+ double tri_area;
+ double area;
+
+ // Prune away any rectangular region above where the diagonal
+ // intersects the left edge of the subpartition, if it exists.
+ bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc );
+
+ // Prune away any rectangular region to the right of where the
+ // diagonal intersects the bottom edge of the subpartition, if
+ // it exists. (This shouldn't ever be needed, since the caller
+ // would presumably have already performed rightward pruning,
+ // but it's here just in case.)
+ bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc );
+
+ ( void )offm_inc;
+ ( void )offn_inc;
+
+ // Compute the area of the empty triangle so we can subtract it
+ // from the area of the rectangle that bounds the subpartition.
+ if ( bli_intersects_diag_n( diagoff, m, n ) )
+ {
+ double tri_dim = ( double )( n - diagoff - 1 );
+ tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0;
+ }
+ else
+ {
+ // If the diagonal does not intersect the trapezoid, then
+ // we can compute the area as a simple rectangle.
+ tri_area = 0.0;
+ }
+
+ area = ( double )m * ( double )n - tri_area;
+
+ return ( siz_t )area;
+}
+
+siz_t bli_get_range_weighted( void* thr,
+ doff_t diagoff,
+ uplo_t uplo,
+ dim_t m,
+ dim_t n,
+ dim_t bf,
+ bool_t handle_edge_low,
+ dim_t* j_start_thr,
+ dim_t* j_end_thr )
+{
+ thrinfo_t* thread = ( thrinfo_t* )thr;
+
+ dim_t n_way = thread->n_way;
+ dim_t my_id = thread->work_id;
+
+ dim_t bf_left = n % bf;
+
+ dim_t j;
+
+ dim_t off_j;
+ doff_t diagoff_j;
+ dim_t n_left;
+
+ dim_t width_j;
+
+ dim_t offm_inc, offn_inc;
+
+ double tri_dim, tri_area;
+ double area_total, area_per_thr;
+
+ siz_t area = 0;
+
+ // In this function, we assume that the caller has already determined
+ // that (a) the diagonal intersects the submatrix, and (b) the submatrix
+ // is either lower- or upper-stored.
+
+ if ( bli_is_lower( uplo ) )
+ {
+ // Prune away the unstored region above the diagonal, if it exists,
+ // and then to the right of where the diagonal intersects the bottom,
+ // if it exists. (Also, we discard the offset deltas since we don't
+ // need to actually index into the subpartition.)
+ bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc );
+ bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc );
+
+ // We don't need offm_inc, offn_inc here. These statements should
+ // prevent compiler warnings.
+ ( void )offm_inc;
+ ( void )offn_inc;
+
+ // Now that pruning has taken place, we know that diagoff >= 0.
+
+ // Compute the total area of the submatrix, accounting for the
+ // location of the diagonal, and divide it by the number of ways
+ // of parallelism.
+ tri_dim = ( double )( n - diagoff - 1 );
+ tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0;
+ area_total = ( double )m * ( double )n - tri_area;
+ area_per_thr = area_total / ( double )n_way;
+
+ // Initialize some variables prior to the loop: the offset to the
+ // current subpartition, the remainder of the n dimension, and
+ // the diagonal offset of the current subpartition.
+ off_j = 0;
+ diagoff_j = diagoff;
+ n_left = n;
+
+ // Iterate over the subpartition indices corresponding to each
+ // thread/caucus participating in the n_way parallelism.
+ for ( j = 0; j < n_way; ++j )
+ {
+ // Compute the width of the jth subpartition, taking the
+ // current diagonal offset into account, if needed.
+
+ width_j = bli_get_range_width_l( diagoff_j, m, n_left,
+ j, n_way,
+ bf, bf_left,
+ area_per_thr,
+ handle_edge_low );
+
+ // If the current thread belongs to caucus j, this is his
+ // subpartition. So we compute the implied index range and
+ // end our search.
+ if ( j == my_id )
{
- *start = *start + width;
- work_id--;
+ *j_start_thr = off_j;
+ *j_end_thr = off_j + width_j;
+
+ area = bli_find_area_trap_l( m, width_j, diagoff_j );
+
+ break;
}
+
+ // Shift the current subpartition's starting and diagonal offsets,
+ // as well as the remainder of the n dimension, according to the
+ // computed width, and then iterate to the next subpartition.
+ off_j += width_j;
+ diagoff_j -= width_j;
+ n_left -= width_j;
}
}
+ else // if ( bli_is_upper( uplo ) )
+ {
+ // Express the upper-stored case in terms of the lower-stored case.
+
+ // First, we convert the upper-stored trapezoid to an equivalent
+ // lower-stored trapezoid by rotating it 180 degrees.
+ bli_rotate180_trapezoid( diagoff, uplo );
+
+ // Now that the trapezoid is "flipped" in the n dimension, negate
+ // the bool that encodes whether to handle the edge case at the
+ // low (or high) end of the index range.
+ bli_toggle_bool( handle_edge_low );
+
+ // Compute the appropriate range for the rotated trapezoid.
+ area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
+ handle_edge_low,
+ j_start_thr, j_end_thr );
+
+ // Reverse the indexing basis for the subpartition ranges so that
+ // the indices, relative to left-to-right iteration through the
+ // unrotated upper-stored trapezoid, map to the correct columns
+ // (relative to the diagonal). This amounts to subtracting the
+ // range from n.
+ bli_reverse_index_direction( *j_start_thr, *j_end_thr, n );
+ }
+
+ return area;
}
-void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- if ( bli_is_upper_or_lower( uplo ) )
+ siz_t area;
+
+ // This function assigns area-weighted ranges in the n dimension
+ // where the total range spans 0 to n-1 with 0 at the left end and
+ // n-1 at the right end.
+
+ if ( bli_obj_intersects_diag( *a ) &&
+ bli_obj_is_upper_or_lower( *a ) )
{
- bli_get_range_weighted( thr, all_start, all_end, block_factor,
- uplo, FALSE, start, end );
+ doff_t diagoff = bli_obj_diag_offset( *a );
+ uplo_t uplo = bli_obj_uplo( *a );
+ dim_t m = bli_obj_length( *a );
+ dim_t n = bli_obj_width( *a );
+
+ // Support implicit transposition.
+ if ( bli_obj_has_trans( *a ) )
+ {
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+ }
+
+ area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
+ FALSE, start, end );
}
else // if dense or zeros
{
- bli_get_range_l2r( thr, all_start, all_end, block_factor,
- start, end );
+ area = bli_get_range_l2r( thr, a, bf,
+ start, end );
}
+
+ return area;
}
-void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- if ( bli_is_upper_or_lower( uplo ) )
+ siz_t area;
+
+ // This function assigns area-weighted ranges in the n dimension
+ // where the total range spans 0 to n-1 with 0 at the right end and
+ // n-1 at the left end.
+
+ if ( bli_obj_intersects_diag( *a ) &&
+ bli_obj_is_upper_or_lower( *a ) )
{
- //printf( "bli_get_range_weighted_r2l: is upper or lower\n" );
- bli_toggle_uplo( uplo );
- bli_get_range_weighted( thr, all_start, all_end, block_factor,
- uplo, TRUE, start, end );
+ doff_t diagoff = bli_obj_diag_offset( *a );
+ uplo_t uplo = bli_obj_uplo( *a );
+ dim_t m = bli_obj_length( *a );
+ dim_t n = bli_obj_width( *a );
+
+ // Support implicit transposition.
+ if ( bli_obj_has_trans( *a ) )
+ {
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+ }
+
+ bli_rotate180_trapezoid( diagoff, uplo );
+
+ area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
+ TRUE, start, end );
}
else // if dense or zeros
{
- //printf( "bli_get_range_weighted_r2l: is dense or zeros\n" );
- bli_get_range_r2l( thr, all_start, all_end, block_factor,
- start, end );
+ area = bli_get_range_r2l( thr, a, bf,
+ start, end );
}
+
+ return area;
}
-void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- if ( bli_is_upper_or_lower( uplo ) )
+ siz_t area;
+
+ // This function assigns area-weighted ranges in the m dimension
+ // where the total range spans 0 to m-1 with 0 at the top end and
+ // m-1 at the bottom end.
+
+ if ( bli_obj_intersects_diag( *a ) &&
+ bli_obj_is_upper_or_lower( *a ) )
{
- bli_toggle_uplo( uplo );
- bli_get_range_weighted( thr, all_start, all_end, block_factor,
- uplo, FALSE, start, end );
+ doff_t diagoff = bli_obj_diag_offset( *a );
+ uplo_t uplo = bli_obj_uplo( *a );
+ dim_t m = bli_obj_length( *a );
+ dim_t n = bli_obj_width( *a );
+
+ // Support implicit transposition.
+ if ( bli_obj_has_trans( *a ) )
+ {
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+ }
+
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+
+ area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
+ FALSE, start, end );
}
else // if dense or zeros
{
- bli_get_range_t2b( thr, all_start, all_end, block_factor,
- start, end );
+ area = bli_get_range_t2b( thr, a, bf,
+ start, end );
}
+
+ return area;
}
-void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
+siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
- if ( bli_is_upper_or_lower( uplo ) )
+ siz_t area;
+
+ // This function assigns area-weighted ranges in the m dimension
+ // where the total range spans 0 to m-1 with 0 at the bottom end and
+ // m-1 at the top end.
+
+ if ( bli_obj_intersects_diag( *a ) &&
+ bli_obj_is_upper_or_lower( *a ) )
{
- bli_get_range_weighted( thr, all_start, all_end, block_factor,
- uplo, TRUE, start, end );
+ doff_t diagoff = bli_obj_diag_offset( *a );
+ uplo_t uplo = bli_obj_uplo( *a );
+ dim_t m = bli_obj_length( *a );
+ dim_t n = bli_obj_width( *a );
+
+ // Support implicit transposition.
+ if ( bli_obj_has_trans( *a ) )
+ {
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+ }
+
+ bli_reflect_about_diag( diagoff, uplo, m, n );
+
+ bli_rotate180_trapezoid( diagoff, uplo );
+
+ area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
+ TRUE, start, end );
}
else // if dense or zeros
{
- bli_get_range_b2t( thr, all_start, all_end, block_factor,
- start, end );
+ area = bli_get_range_b2t( thr, a, bf,
+ start, end );
}
- }
+
+ return area;
+}
+
void bli_level3_thread_decorator( dim_t n_threads,
level3_int_t func,
obj_t* alpha,
diff --git a/src/ti/linalg/blis/frame/base/bli_threading.h b/src/ti/linalg/blis/frame/base/bli_threading.h
index 19c8118fca5bc24fbdb85528caf4962735c9c44f..9a0bd1b3110ffd2298a41f845f5183a263db58de 100644 (file)
//void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end );
//void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end);
-void bli_get_range( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor,
- bool_t handle_edge_low,
+void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low,
dim_t* start, dim_t* end );
-void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor,
- dim_t* start, dim_t* end );
-void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor,
- dim_t* start, dim_t* end );
-void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor,
- dim_t* start, dim_t* end );
-void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor,
- dim_t* start, dim_t* end );
-
-void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor, uplo_t uplo,
- bool_t handle_edge_low,
- dim_t* start, dim_t* end );
-void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor, uplo_t uplo,
- dim_t* start, dim_t* end );
-void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor, uplo_t uplo,
- dim_t* start, dim_t* end );
-void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor, uplo_t uplo,
- dim_t* start, dim_t* end );
-void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end,
- dim_t block_factor, uplo_t uplo,
- dim_t* start, dim_t* end );
+siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+
+dim_t bli_get_range_width_l( doff_t diagoff_j,
+ dim_t m,
+ dim_t n_j,
+ dim_t j,
+ dim_t n_way,
+ dim_t bf,
+ dim_t bf_left,
+ double area_per_thr,
+ bool_t handle_edge_low );
+siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff );
+siz_t bli_get_range_weighted( void* thr,
+ doff_t diagoff,
+ uplo_t uplo,
+ dim_t m,
+ dim_t n,
+ dim_t bf,
+ bool_t handle_edge_low,
+ dim_t* j_start_thr,
+ dim_t* j_end_thr );
+
+siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
+siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
diff --git a/src/ti/linalg/blis/frame/include/bli_param_macro_defs.h b/src/ti/linalg/blis/frame/include/bli_param_macro_defs.h
index 3b612989c8f0c4942960f3cf60b565f60bc79b0e..4418c8284033ba1e8a0dfb955c9f2bfd28698da4 100644 (file)
( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || \
( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) )
+// pruning-related
+
+#define bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ) \
+{ \
+ offm_inc = 0; \
+\
+ /* If the diagonal intersects the left side of the matrix,
+ ignore the area above that intersection. */ \
+ if ( diagoff < 0 ) \
+ { \
+ m = m + diagoff; \
+ offm_inc = - diagoff; \
+ diagoff = 0; \
+ } \
+}
+
+#define bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ) \
+{ \
+ offn_inc = 0; \
+\
+ /* If the diagonal intersects the bottom side of the matrix,
+ ignore the area to the right of that intersection. */ \
+ if ( n > diagoff + m ) \
+ { \
+ n = diagoff + m; \
+ } \
+}
+
+#define bli_prune_unstored_region_left_u( diagoff, m, n, offn_inc ) \
+{ \
+ offn_inc = 0; \
+\
+ /* If the diagonal intersects the top side of the matrix,
+ ignore the area to the left of that intersection. */ \
+ if ( diagoff > 0 ) \
+ { \
+ n = n - diagoff; \
+ offn_inc = + diagoff; \
+ diagoff = 0; \
+ } \
+}
+
+#define bli_prune_unstored_region_bottom_u( diagoff, m, n, offm_inc ) \
+{ \
+ offm_inc = 0; \
+\
+ /* If the diagonal intersects the right side of the matrix,
+ ignore the area below that intersection. */ \
+ if ( m > -diagoff + n ) \
+ { \
+ m = -diagoff + n; \
+ } \
+}
+
+
+// thread range-related
+
+#define bli_rotate180_trapezoid( diagoff, uplo ) \
+{ \
+ diagoff = n - diagoff - m; \
+ bli_toggle_uplo( uplo ); \
+}
+
+#define bli_reverse_index_direction( start, end, n ) \
+{ \
+ dim_t start2 = n - start; \
+ dim_t end2 = n - end; \
+ start = end2; \
+ end = start2; \
+}
+
+#define bli_reflect_about_diag( diagoff, uplo, m, n ) \
+{ \
+ bli_swap_dims( m, n ); \
+ bli_negate_diag_offset( diagoff ); \
+ bli_toggle_uplo( uplo ); \
+}
+
// index-related
diff --git a/src/ti/linalg/blis/frame/include/bli_scalar_macro_defs.h b/src/ti/linalg/blis/frame/include/bli_scalar_macro_defs.h
index 832dd9f483d15f4f05fac93d532c1d723a83a77c..5bd94612102070b7b68f995dbb73d256edad43d1 100644 (file)
bli_fmax( bli_fabs( a ), \
bli_fabs( b ) )
+// round
+
+#define bli_round( val ) \
+\
+ ( round( val ) )
+
+// round_to_mult
+
+#define bli_round_to_mult( val, mult ) \
+\
+ ( guint_t )( ( ( ( guint_t )val + \
+ ( guint_t )mult / 2 \
+ ) / mult \
+ ) * mult \
+ )
+
// isnan, isinf
#define bli_isinf( a ) isinf( a )
diff --git a/src/ti/linalg/blis/testsuite/input.operations b/src/ti/linalg/blis/testsuite/input.operations
index 863fd31587a60e8b3c0429209f7fd94f15eabbc9..695c07bf8965c1ff445edf7101ac45a4d1d6c496 100644 (file)
1 # herk
1 # test sequential front-end
-1 -1 # dimensions: m k
-?n # parameters: uploc transa
+?? # parameters: uploc transa
1 # her2k
1 # test sequential front-end
1 # trmm
1 # test sequential front-end
-1 -1 # dimensions: m n
-??nn # parameters: side uploa transa diaga
+???? # parameters: side uploa transa diaga
0 # trmm3
1 # test sequential front-end
1 # trsm
1 # test sequential front-end
-1 -1 # dimensions: m n
-??nn # parameters: side uploa transa diaga
+???? # parameters: side uploa transa diaga