1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #ifndef BLIS_CONFIG_H
36 #define BLIS_CONFIG_H
// Enable the C66x build and the C66x memory pools.
#define BLIS_ENABLE_C66X_BUILD
#define BLIS_ENABLE_C66X_MEM_POOLS

// OpenCL builds only: clocl creates a cio section in L2 whenever fprintf is
// used, so fprintf is remapped to ti_printf.
#if defined(BLIS_ENABLE_C66X_OPENCL)
#  define fprintf ti_printf
#endif
51 // -- OPERATING SYSTEM ---------------------------------------------------------
53 // -- INTEGER PROPERTIES -------------------------------------------------------
// Width, in bits, of the integer type BLIS uses to track values such as
// dimensions, strides and diagonal offsets:
//   32        -> 32-bit signed integers
//   64        -> 64-bit integers
//   otherwise -> the C99 type "long int"
// This setting ONLY affects integers used internally within BLIS and those
// exposed in the native BLAS-like BLIS interface.
#define BLIS_INT_TYPE_SIZE    32
65 // -- FLOATING-POINT PROPERTIES ------------------------------------------------
// How many floating-point types are supported, and the size of the largest
// of them.
#define BLIS_NUM_FP_TYPES     4
#define BLIS_MAX_TYPE_SIZE    sizeof(dcomplex)

// Uncomment to use the built-in C99 "float complex" and "double complex"
// types together with their overloaded operations and functions. While this
// stays disabled, scomplex and dcomplex are defined in terms of simple
// structs.
//#define BLIS_ENABLE_C99_COMPLEX
77 // -- c66x headers -------------------------------------------------------------
78 #include "c6x.h"
80 //#include <ti/csl/device/k2h/src/cslr_device.h>
82 #include <libarch.h>
83 #include <ti/csl/csl_chipAux.h> // CSL_chipReadDNUM -> to read coreID
84 #include <ti/csl/csl_cacheAux.h> // CACHE_invL1d
86 // for __clock64()
87 //#include <dsp_c.h>
89 // -- EDMA ---------------------------------------------------------------------
#define BLIS_ENABLE_C66X_EDMA

#if defined(BLIS_ENABLE_C66X_EDMA)

// GEMM DMA control objects for A and B. The large and medium memory models
// map to gemm_dmaa_cntl / gemm_dmab_cntl; the small model maps both to NULL.
// NOTE(review): if none of the MEM_MODEL_* macros is defined, neither macro
// below gets a definition -- confirm the build always selects a model.
#if defined(MEM_MODEL_LARGE) || defined(MEM_MODEL_MEDIUM)
#  define BLIS_GEMM_DMAA_CNTL gemm_dmaa_cntl
#  define BLIS_GEMM_DMAB_CNTL gemm_dmab_cntl
#elif defined(MEM_MODEL_SMALL)
#  define BLIS_GEMM_DMAA_CNTL NULL
#  define BLIS_GEMM_DMAB_CNTL NULL
#endif

// Disabled selection between the framework-components EDMA manager and the
// local edmamgr.h header; kept for reference.
/*
#if USING_FC_EDMAMGR
#include <xdc/std.h>
#define ECPY_INLINE_ALL 1
#define EDMAMGR_INLINE_ALL 1
#include <ti/sdo/fc/edmamgr/edmamgr.h>
#else
#include "edmamgr.h"
#endif
*/

// Largest DMA stride value and the maximum number of EDMA channels.
#define BLIS_C66X_MAXDMASTRIDE           0x7FFF
#define BLIS_C66X_EDMA_MAX_NUM_CHANNELS  6

#endif
121 // -- PREFETCH -----------------------------------------------------------------
// Prefetch support is disabled; uncomment the define below to enable it,
// which also pulls in touch.h.
//#define BLIS_ENABLE_C66X_PREFETCH

#if defined(BLIS_ENABLE_C66X_PREFETCH)
#  include "touch.h"
#endif
128 // -- IDMA -----------------------------------------------------------------
129 #define BLIS_ENABLE_C66X_IDMA
131 #ifdef BLIS_ENABLE_C66X_IDMA
132 #include "idma.h"
133 #endif
135 // -- PROFILE -----------------------------------------------------------------
// Uncomment the following line to enable performance profiling.
137 //#define BLIS_ENABLE_PROFILE
139 // -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
//
// While testing this code on Hawking, the value of BLIS_MAX_NUM_THREADS
// needs to be 8. OpenMP randomly assigns the OpenMP threads to the cores, so
// this value needs to be 8 to make sure all the cores are initialized before
// the OpenMP region begins.
//
// When porting to a specific architecture, change BLIS_MAX_NUM_THREADS to
// the number of cores available on the device. BLIS_C66X_IC_NT is defined in
// terms of BLIS_MAX_NUM_THREADS and follows it automatically.
#define BLIS_ENABLE_MULTITHREADING
#define BLIS_ENABLE_OPENMP
#define BLIS_MAX_NUM_THREADS 8

// BLIS_C66X_IC_NT equals BLIS_MAX_NUM_THREADS under every memory model
// (large, medium and small), so it is defined unconditionally. This also
// keeps it defined when no MEM_MODEL_* macro is set.
#define BLIS_C66X_IC_NT BLIS_MAX_NUM_THREADS
#define BLIS_C66X_JC_NT 1
#define BLIS_C66X_JR_NT 1
#define BLIS_C66X_IR_NT 1
172 // -- MEMORY ALLOCATION --------------------------------------------------------
174 // -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
//
// Block counts that are computed from BLIS_MAX_NUM_THREADS are fully
// parenthesized so that they expand correctly inside larger expressions
// (for example when multiplied by a block size).

#define BLIS_NUM_MC_X_KC_BLOCKS_L3 0
#define BLIS_NUM_MC_X_KC_BLOCKS_L2 2 // Each L2 RAM is local to the DSP. Just need one buffer per thread that is packed
#define BLIS_NUM_MC_X_KC_BLOCKS_L1 0
#define BLIS_NUM_MR_X_KC_BLOCKS_L1 2 // To transfer A to L1 in a ping-pong manner
#define BLIS_NUM_MC_X_KC_BLOCKS    (2 * (BLIS_MAX_NUM_THREADS) + 1) // To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now

#define BLIS_NUM_KC_X_NC_BLOCKS_L3 2 // Each thread shares a B block, so do not need 8 buffers *BLIS_MAX_NUM_THREADS // One for the partitioned B1, and one for the packed B1
#define BLIS_NUM_KC_X_NC_BLOCKS_L2 0
#define BLIS_NUM_KC_X_NC_BLOCKS_L1 0
#define BLIS_NUM_KC_X_NR_BLOCKS_L1 1
#define BLIS_NUM_KC_X_NC_BLOCKS    (2 * (BLIS_MAX_NUM_THREADS)) // To test w/o DMA and L2, L3 memory, all memory must be in DDR3 now

#define BLIS_NUM_MC_X_NC_BLOCKS_L3 0
#define BLIS_NUM_MC_X_NC_BLOCKS_L2 0
#define BLIS_NUM_MC_X_NR_BLOCKS_L2 3 // Bringing C into the L2 memory. We need 3 buffers: one to read, one to compute and one to write.
#define BLIS_NUM_MC_X_NC_BLOCKS_L1 0
#define BLIS_NUM_MR_X_NR_BLOCKS_L1 0
#define BLIS_NUM_MC_X_NC_BLOCKS    0
// Padding, in bytes, appended to the end of the contiguous memory pools.
// It lets the micro-kernel, when computing with the end of the last block,
// run past the usable portion of the memory region without causing a
// segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET    128
205 // -- Memory alignment --
// System characteristics (cache line size and page size) that the memory
// alignments below can be expressed in terms of.
#define BLIS_CACHE_LINE_SIZE           64
#define BLIS_PAGE_SIZE                 4096

// Alignment the instruction set needs for aligned SIMD/vector instructions.
#define BLIS_SIMD_ALIGN_SIZE           16

// Alignment of local stack buffers within macro-kernel functions.
#define BLIS_STACK_BUF_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE

// Alignment of memory allocated dynamically from the operating system
// (eg: posix_memalign()). Set this to 1 to disable heap alignment and just
// use malloc() instead.
#define BLIS_HEAP_ADDR_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE

// Alignment applied when sizing leading dimensions of dynamically allocated
// memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE    BLIS_CACHE_LINE_SIZE

// Alignment of entire blocks of contiguous memory handed out by the
// contiguous memory allocator. (Alternative setting: BLIS_PAGE_SIZE.)
#define BLIS_CONTIG_ADDR_ALIGN_SIZE    BLIS_CACHE_LINE_SIZE
// Extra buffer space in each block in L1 to account for bank conflicts.
//
// There are 2 buffers of size MR*KC and 1 buffer of size KC*NR in L1. The
// extra buffer space in each block in L1 is computed based on the remaining
// space available in L1. L1DSRAM is configured to size 28K.
//
//   total size of the blocks in L1
//       = 2*MR*KC*sizeof(datatype) + KC*NR*sizeof(datatype)
//
// The remaining available space in L1 is divided such that
//
//   2*BLIS_MRK_BLOCK_BUFFER_L1 + 1*BLIS_KNR_PANEL_BUFFER_L1
//       + 0*BLIS_MRNR_BLOCK_BUFFER_L1 = remaining available space.
//
// The macros are grouped per datatype suffix (S, D, C, Z).

// S
#define BLIS_MRK_BLOCK_BUFFER_L1_S   128
#define BLIS_KNR_PANEL_BUFFER_L1_S   256
#define BLIS_MRNR_BLOCK_BUFFER_L1_S  128

// D
#define BLIS_MRK_BLOCK_BUFFER_L1_D   64
#define BLIS_KNR_PANEL_BUFFER_L1_D   128
#define BLIS_MRNR_BLOCK_BUFFER_L1_D  64

// C
#define BLIS_MRK_BLOCK_BUFFER_L1_C   64
#define BLIS_KNR_PANEL_BUFFER_L1_C   128
#define BLIS_MRNR_BLOCK_BUFFER_L1_C  64

// Z
#define BLIS_MRK_BLOCK_BUFFER_L1_Z   32
#define BLIS_KNR_PANEL_BUFFER_L1_Z   64
#define BLIS_MRNR_BLOCK_BUFFER_L1_Z  32

// Per-datatype bank values.
#define bli_sbank  8
#define bli_dbank  16
#define bli_cbank  16
#define bli_zbank  24
264 // -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support is always enabled.

// Mixed domain operations: disabled (uncomment to enable).
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT

// Extra mixed precision operations: disabled (uncomment to enable).
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// After auto-initialization, stay initialized unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
284 // -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Switch for the BLAS compatibility layer (enabled).
#define BLIS_ENABLE_BLAS2BLIS

// Width, in bits, of the integer type the BLAS compatibility layer uses to
// track values such as dimensions and leading dimensions (ie: column
// strides):
//   32        -> 32-bit signed integers
//   64        -> 64-bit integers
//   otherwise -> the C99 type "long int"
// This setting ONLY affects integers used within the BLAS compatibility
// layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE    32

// Fortran-77 name mangling: concatenate the optional prefix characters with
// the routine name and append a trailing underscore.
#define PASTEF770(name)            name ## _
#define PASTEF77(c1,name)          c1 ## name ## _
#define PASTEF772(c1,c2,name)      c1 ## c2 ## name ## _
// Declared here, defined elsewhere: takes no arguments and returns an opaque
// pointer. The explicit (void) makes this a real prototype, so calls that
// pass arguments are rejected at compile time.
// NOTE(review): presumably returns the memory handle used by the BLAS layer
// -- confirm against the definition.
extern void *blasGetMemHandle(void);
306 #endif