1 /*
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
7 Copyright (C) 2014, The University of Texas at Austin
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
35 #include "blis.h"
37 #define FUNCPTR_T her2_fp
39 typedef void (*FUNCPTR_T)(
40 uplo_t uplo,
41 conj_t conjx,
42 conj_t conjy,
43 conj_t conjh,
44 dim_t m,
45 void* alpha,
46 void* x, inc_t incx,
47 void* y, inc_t incy,
48 void* c, inc_t rs_c, inc_t cs_c
49 );
51 // If some mixed datatype functions will not be compiled, we initialize
52 // the corresponding elements of the function array to NULL.
53 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
54 static FUNCPTR_T GENARRAY3_ALL(ftypes,her2_unb_var1);
55 #else
56 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
57 static FUNCPTR_T GENARRAY3_EXT(ftypes,her2_unb_var1);
58 #else
59 static FUNCPTR_T GENARRAY3_MIN(ftypes,her2_unb_var1);
60 #endif
61 #endif
64 void bli_her2_unb_var1( conj_t conjh,
65 obj_t* alpha,
66 obj_t* alpha_conj,
67 obj_t* x,
68 obj_t* y,
69 obj_t* c,
70 her2_t* cntl )
71 {
72 num_t dt_x = bli_obj_datatype( *x );
73 num_t dt_y = bli_obj_datatype( *y );
74 num_t dt_c = bli_obj_datatype( *c );
76 uplo_t uplo = bli_obj_uplo( *c );
77 conj_t conjx = bli_obj_conj_status( *x );
78 conj_t conjy = bli_obj_conj_status( *y );
80 dim_t m = bli_obj_length( *c );
82 void* buf_x = bli_obj_buffer_at_off( *x );
83 inc_t incx = bli_obj_vector_inc( *x );
85 void* buf_y = bli_obj_buffer_at_off( *y );
86 inc_t incy = bli_obj_vector_inc( *y );
88 void* buf_c = bli_obj_buffer_at_off( *c );
89 inc_t rs_c = bli_obj_row_stride( *c );
90 inc_t cs_c = bli_obj_col_stride( *c );
92 num_t dt_alpha;
93 void* buf_alpha;
95 FUNCPTR_T f;
97 // The datatype of alpha MUST be the type union of the datatypes of x and y.
98 dt_alpha = bli_datatype_union( dt_x, dt_y );
99 buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );
101 // Index into the type combination array to extract the correct
102 // function pointer.
103 f = ftypes[dt_x][dt_y][dt_c];
105 // Invoke the function.
106 f( uplo,
107 conjx,
108 conjy,
109 conjh,
110 m,
111 buf_alpha,
112 buf_x, incx,
113 buf_y, incy,
114 buf_c, rs_c, cs_c );
115 }
118 #undef GENTFUNC3U12
119 #define GENTFUNC3U12( ctype_x, ctype_y, ctype_c, ctype_xy, chx, chy, chc, chxy, varname, kername ) \
120 \
121 void PASTEMAC3(chx,chy,chc,varname)( \
122 uplo_t uplo, \
123 conj_t conjx, \
124 conj_t conjy, \
125 conj_t conjh, \
126 dim_t m, \
127 void* alpha, \
128 void* x, inc_t incx, \
129 void* y, inc_t incy, \
130 void* c, inc_t rs_c, inc_t cs_c \
131 ) \
132 { \
133 ctype_xy* two = PASTEMAC(chxy,2); \
134 ctype_xy* alpha_cast = alpha; \
135 ctype_x* x_cast = x; \
136 ctype_y* y_cast = y; \
137 ctype_c* c_cast = c; \
138 ctype_x* x0; \
139 ctype_x* chi1; \
140 ctype_y* y0; \
141 ctype_y* psi1; \
142 ctype_c* c10t; \
143 ctype_c* gamma11; \
144 ctype_xy alpha0; \
145 ctype_xy alpha1; \
146 ctype_xy alpha0_chi1; \
147 ctype_xy alpha1_psi1; \
148 ctype_xy alpha0_chi1_psi1; \
149 ctype_x conjx0_chi1; \
150 ctype_y conjy1_psi1; \
151 ctype_y conjy0_psi1; \
152 dim_t i; \
153 dim_t n_behind; \
154 inc_t rs_ct, cs_ct; \
155 conj_t conj0, conj1; \
156 \
157 if ( bli_zero_dim1( m ) ) return; \
158 \
159 if ( PASTEMAC(chxy,eq0)( *alpha_cast ) ) return; \
160 \
161 /* The algorithm will be expressed in terms of the lower triangular case;
162 the upper triangular case is supported by swapping the row and column
163 strides of A and toggling some conj parameters. */ \
164 if ( bli_is_lower( uplo ) ) \
165 { \
166 rs_ct = rs_c; \
167 cs_ct = cs_c; \
168 \
169 PASTEMAC2(chxy,chxy,copys)( *alpha_cast, alpha0 ); \
170 PASTEMAC2(chxy,chxy,copycjs)( conjh, *alpha_cast, alpha1 ); \
171 } \
172 else /* if ( bli_is_upper( uplo ) ) */ \
173 { \
174 rs_ct = cs_c; \
175 cs_ct = rs_c; \
176 \
177 /* Toggle conjugation of conjx/conjy, but only if we are being invoked
178 as her2; for syr2, conjx/conjy are unchanged. */ \
179 conjx = bli_apply_conj( conjh, conjx ); \
180 conjy = bli_apply_conj( conjh, conjy ); \
181 \
182 PASTEMAC2(chxy,chxy,copycjs)( conjh, *alpha_cast, alpha0 ); \
183 PASTEMAC2(chxy,chxy,copys)( *alpha_cast, alpha1 ); \
184 } \
185 \
186 /* Apply conjh (which carries the conjugation component of the Hermitian
187 transpose, if applicable) to conjx and/or conjy as needed to arrive at
188 the effective conjugation for the vector subproblems. */ \
189 conj0 = bli_apply_conj( conjh, conjy ); \
190 conj1 = bli_apply_conj( conjh, conjx ); \
191 \
192 for ( i = 0; i < m; ++i ) \
193 { \
194 n_behind = i; \
195 x0 = x_cast + (0 )*incx; \
196 chi1 = x_cast + (i )*incx; \
197 y0 = y_cast + (0 )*incy; \
198 psi1 = y_cast + (i )*incy; \
199 c10t = c_cast + (i )*rs_ct + (0 )*cs_ct; \
200 gamma11 = c_cast + (i )*rs_ct + (i )*cs_ct; \
201 \
202 /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
203 PASTEMAC2(chx,chx,copycjs)( conjx, *chi1, conjx0_chi1 ); \
204 PASTEMAC2(chy,chy,copycjs)( conjy, *psi1, conjy1_psi1 ); \
205 PASTEMAC2(chy,chy,copycjs)( conj0, *psi1, conjy0_psi1 ); \
206 \
207 /* Compute scalars for vector subproblems. */ \
208 PASTEMAC3(chxy,chx,chxy,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
209 PASTEMAC3(chxy,chx,chxy,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
210 \
211 /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have
212 already been conjugated, if needed, by conjx and conjy. */ \
213 PASTEMAC3(chy,chxy,chxy,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
214 \
215 /* c10t = c10t + alpha * chi1 * y0'; */ \
216 PASTEMAC3(chxy,chy,chc,kername)( conj0, \
217 n_behind, \
218 &alpha0_chi1, \
219 y0, incy, \
220 c10t, cs_ct ); \
221 \
222 /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
223 PASTEMAC3(chxy,chx,chc,kername)( conj1, \
224 n_behind, \
225 &alpha1_psi1, \
226 x0, incx, \
227 c10t, cs_ct ); \
228 \
229 /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \
230 + conj(alpha) * psi1 * conj(chi1); */ \
231 PASTEMAC3(chxy,chxy,chc,axpys)( *two, alpha0_chi1_psi1, *gamma11 ); \
232 \
233 /* For her2, explicitly set the imaginary component of gamma11 to
234 zero. */ \
235 if ( bli_is_conj( conjh ) ) \
236 PASTEMAC(chc,seti0s)( *gamma11 ); \
237 } \
238 }
240 // Define the basic set of functions unconditionally, and then also some
241 // mixed datatype functions if requested.
242 INSERT_GENTFUNC3U12_BASIC( her2_unb_var1, AXPYV_KERNEL )
244 #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
245 INSERT_GENTFUNC3U12_MIX_D( her2_unb_var1, AXPYV_KERNEL )
246 #endif
248 #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
249 INSERT_GENTFUNC3U12_MIX_P( her2_unb_var1, AXPYV_KERNEL )
250 #endif