blis/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c

   1 /*
   2
   3    BLIS
   4    An object-based framework for developing high-performance BLAS-like
   5    libraries.
   6
   7    Copyright (C) 2014, The University of Texas at Austin
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions are
  11    met:
  12     - Redistributions of source code must retain the above copyright
  13       notice, this list of conditions and the following disclaimer.
  14     - Redistributions in binary form must reproduce the above copyright
  15       notice, this list of conditions and the following disclaimer in the
  16       documentation and/or other materials provided with the distribution.
  17     - Neither the name of The University of Texas at Austin nor the names
  18       of its contributors may be used to endorse or promote products
  19       derived from this software without specific prior written permission.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 */
  34
  35 #include "blis.h"
  36
  37
  38
  39 void bli_sgemmtrsm_u_opt_mxn(
  40                               dim_t              k,
  41                               float*    restrict alpha,
  42                               float*    restrict a12,
  43                               float*    restrict a11,
  44                               float*    restrict b21,
  45                               float*    restrict b11,
  46                               float*    restrict c11, inc_t rs_c, inc_t cs_c,
  47                               auxinfo_t*         data
  48                             )
  49 {
  50         const inc_t        rs_b      = bli_spacknr;
  51         const inc_t        cs_b      = 1;
  52
  53         float*    restrict minus_one = bli_sm1;
  54
  55
  56         bli_sgemm_opt_mxn( k,
  57                            minus_one,
  58                            a12,
  59                            b21,
  60                            alpha,
  61                            b11, rs_b, cs_b,
  62                            data );
  63
  64         bli_strsm_u_opt_mxn( a11,
  65                              b11,
  66                              c11, rs_c, cs_c,
  67                              data );
  68 }
  69
  70
  71
  72 void bli_dgemmtrsm_u_opt_mxn(
  73                               dim_t              k,
  74                               double*   restrict alpha,
  75                               double*   restrict a12,
  76                               double*   restrict a11,
  77                               double*   restrict b21,
  78                               double*   restrict b11,
  79                               double*   restrict c11, inc_t rs_c, inc_t cs_c,
  80                               auxinfo_t*         data
  81                             )
  82 {
  83 /*
  84   Template gemmtrsm_u micro-kernel implementation
  85
  86   This function contains a template implementation for a double-precision
  87   real micro-kernel that fuses a gemm with a trsm_u subproblem.
  88
  89   This micro-kernel performs the following compound operation:
  90
  91     B11 := alpha * B11 - A12 * B21    (gemm)
  92     B11 := inv(A11) * B11             (trsm)
  93     C11 := B11
  94
  95   where A11 is MR x MR and upper triangular, A12 is MR x k, B21 is k x NR,
  96   B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix
  97   inverse.
  98
  99   Parameters:
 100
 101   - k:      The number of columns of A12 and rows of B21.
 102   - alpha:  The address of a scalar to be applied to B11.
 103   - a12:    The address of A12, which is the MR x k submatrix of the packed
 104             micro-panel of A that is situated to the right of the MR x MR
 105             triangular submatrix A11. A12 is stored by columns with leading
 106             dimension PACKMR, where typically PACKMR = MR.
 107   - a11:    The address of A11, which is the MR x MR upper triangular
 108             submatrix within the packed micro-panel of matrix A that is
 109             situated to the left of A12. A11 is stored by columns with
 110             leading dimension PACKMR, where typically PACKMR = MR. Note
 111             that A11 contains elements in both triangles, though elements
 112             in the unstored triangle are not guaranteed to be zero and
 113             thus should not be referenced.
 114   - b21:    The address of B21, which is the k x NR submatrix of the packed
 115             micro-panel of B that is situated above the MR x NR submatrix
 116             B11. B01 is stored by rows with leading dimension PACKNR, where
 117             typically PACKNR = NR.
 118   - b11:    The address B11, which is the MR x NR submatrix of the packed
 119             micro-panel of B, situated below B01. B11 is stored by rows
 120             with leading dimension PACKNR, where typically PACKNR = NR.
 121   - c11:    The address of C11, which is the MR x NR submatrix of matrix
 122             C, stored according to rs_c and cs_c. C11 is the submatrix
 123             within C that corresponds to the elements which were packed
 124             into B11. Thus, C is the original input matrix B to the overall
 125             trsm operation.
 126   - rs_c:   The row stride of C11 (ie: the distance to the next row of C11,
 127             in units of matrix elements).
 128   - cs_c:   The column stride of C11 (ie: the distance to the next column of
 129             C11, in units of matrix elements).
 130   - data:   The address of an auxinfo_t object that contains auxiliary
 131             information that may be useful when optimizing the gemmtrsm
 132             micro-kernel implementation. (See BLIS KernelsHowTo wiki for
 133             more info.)
 134
 135   Diagram for gemmtrsm_u
 136
 137   The diagram below shows the packed micro-panel operands for trsm_l and
 138   how elements of each would be stored when MR = NR = 4. (The hex digits
 139   indicate the layout and order (but NOT the numeric contents) in memory.
 140   Here, matrix A11 (referenced by a11) is upper triangular. Matrix A11
 141   does contain elements corresponding to the strictly lower triangle,
 142   however, they are not guaranteed to contain zeros and thus these elements
 143   should not be referenced.
 144
 145        a11:     a12:                          NR
 146        ________ ___________________         _______
 147       |`.      |0 4 8              |   b11:|0 1 2 3|
 148   MR  |  `.    |1 5 9 . . .        |       |4 5 6 7|
 149       |    `.  |2 6 A              |    MR |8 9 A B|
 150       |______`.|3_7_B______________|       |___.___|
 151                                        b21:|   .   |
 152           MR             k                 |   .   |
 153                                            |       |
 154                                            |       |
 155     NOTE: Storage digits are shown       k |       |
 156     starting with a12 to avoid             |       |
 157     obscuring triangular structure         |       |
 158     of a11.                                |_______|
 159
 160
 161   Implementation Notes for gemmtrsm
 162
 163   - Register blocksizes. See Implementation Notes for gemm.
 164   - Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation
 165     Notes for gemm.
 166   - Edge cases in MR, NR dimensions. See Implementation Notes for gemm.
 167   - Alignment of a1 and b1. The addresses a1 and b1 are aligned according
 168     to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively.
 169   - Unrolling loops. Most optimized implementations should unroll all
 170     three loops within the trsm subproblem of gemmtrsm. See Implementation
 171     Notes for gemm for remarks on unrolling the gemm subproblem.
 172   - Prefetching next micro-panels of A and B. When invoked from within a
 173     gemmtrsm_l micro-kernel, the addresses accessible via
 174     bli_auxinfo_next_a() and bli_auxinfo_next_b() refer to the next
 175     invocation's a10 and b01, respectively, while in gemmtrsm_u, the
 176     _next_a() and _next_b() macros return the addresses of the next
 177     invocation's a11 and b11 (since those submatrices precede a12 and b21).
 178     (See BLIS KernelsHowTo wiki for more info.)
 179   - Zero alpha. The micro-kernel can safely assume that alpha is non-zero;
 180     "alpha equals zero" handling is performed at a much higher level,
 181     which means that, in such a scenario, the micro-kernel will never get
 182     called.
 183   - Diagonal elements of A11. See Implementation Notes for trsm.
 184   - Zero elements of A11. See Implementation Notes for trsm.
 185   - Output. See Implementation Notes for trsm.
 186   - Optimization. Let's assume that the gemm micro-kernel has already been
 187     optimized. You have two options with regard to optimizing the fused
 188     gemmtrsm micro-kernels:
 189     (1) Optimize only the trsm micro-kernels. This will result in the gemm
 190         and trsm_l micro-kernels being called in sequence. (Likewise for
 191         gemm and trsm_u.)
 192     (2) Fuse the implementation of the gemm micro-kernel with that of the
 193         trsm micro-kernels by inlining both into the gemmtrsm_l and
 194         gemmtrsm_u micro-kernel definitions. This option is more labor-
 195         intensive, but also more likely to yield higher performance because
 196         it avoids redundant memory operations on the packed MR x NR
 197         submatrix B11.
 198
 199   For more info, please refer to the BLIS website and/or contact the
 200   blis-devel mailing list.
 201
 202 */
 203         const inc_t        rs_b      = bli_dpacknr;
 204         const inc_t        cs_b      = 1;
 205
 206         double*   restrict minus_one = bli_dm1;
 207
 208         /* b11 = alpha * b11 - a12 * b21; */
 209         bli_dgemm_opt_mxn( k,
 210                            minus_one,
 211                            a12,
 212                            b21,
 213                            alpha,
 214                            b11, rs_b, cs_b,
 215                            data );
 216
 217         /* b11 = inv(a11) * b11;
 218            c11 = b11; */
 219         bli_dtrsm_u_opt_mxn( a11,
 220                              b11,
 221                              c11, rs_c, cs_c,
 222                              data );
 223 }
 224
 225
 226
 227 void bli_cgemmtrsm_u_opt_mxn(
 228                               dim_t              k,
 229                               scomplex* restrict alpha,
 230                               scomplex* restrict a12,
 231                               scomplex* restrict a11,
 232                               scomplex* restrict b21,
 233                               scomplex* restrict b11,
 234                               scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
 235                               auxinfo_t*         data
 236                             )
 237 {
 238         const inc_t        rs_b      = bli_cpacknr;
 239         const inc_t        cs_b      = 1;
 240
 241         scomplex* restrict minus_one = bli_cm1;
 242
 243
 244         bli_cgemm_opt_mxn( k,
 245                            minus_one,
 246                            a12,
 247                            b21,
 248                            alpha,
 249                            b11, rs_b, cs_b,
 250                            data );
 251
 252         bli_ctrsm_u_opt_mxn( a11,
 253                              b11,
 254                              c11, rs_c, cs_c,
 255                              data );
 256 }
 257
 258
 259
 260 void bli_zgemmtrsm_u_opt_mxn(
 261                               dim_t              k,
 262                               dcomplex* restrict alpha,
 263                               dcomplex* restrict a12,
 264                               dcomplex* restrict a11,
 265                               dcomplex* restrict b21,
 266                               dcomplex* restrict b11,
 267                               dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
 268                               auxinfo_t*         data
 269                             )
 270 {
 271         const inc_t        rs_b      = bli_zpacknr;
 272         const inc_t        cs_b      = 1;
 273
 274         dcomplex* restrict minus_one = bli_zm1;
 275
 276
 277         bli_zgemm_opt_mxn( k,
 278                            minus_one,
 279                            a12,
 280                            b21,
 281                            alpha,
 282                            b11, rs_b, cs_b,
 283                            data );
 284
 285         bli_ztrsm_u_opt_mxn( a11,
 286                              b11,
 287                              c11, rs_c, cs_c,
 288                              data );
 289 }
 290