include/polly/MatmulOptimizer.h

0001 //===- MatmulOptimizer.h -------------------------------------------------===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008
0009 #ifndef POLLY_MATMULOPTIMIZER_H
0010 #define POLLY_MATMULOPTIMIZER_H
0011
0012 #include "isl/isl-noexceptions.h"
0013
0014 namespace llvm {
0015 class TargetTransformInfo; // Forward declaration: only used via pointer below.
0016 }
0017
0018 namespace polly {
0019 class Dependences; // Forward declaration: only used via pointer below.
0020
0021 /// Apply the BLIS matmul optimization pattern if possible.
0022 ///
0023 /// Make the loops containing the matrix multiplication be the innermost
0024 /// loops and apply the BLIS matmul optimization pattern. BLIS implements
0025 /// gemm as three nested loops around a macro-kernel, plus two packing
0026 /// routines. The macro-kernel is implemented in terms of two additional
0027 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
0028 /// (i.e., outer product) update.
0029 ///
0030 /// For a detailed description please see [1].
0031 ///
0032 /// The order of the loops defines the data reused in the BLIS implementation
0033 /// of gemm ([1]). In particular, elements of the matrix B, the second
0034 /// operand of matrix multiplication, are reused between iterations of the
0035 /// innermost loop. To keep the reused data in cache, only elements of matrix
0036 /// A, the first operand of matrix multiplication, should be evicted during
0037 /// an iteration of the innermost loop. To provide such a cache replacement
0038 /// policy, elements of the matrix A can, in particular, be loaded first and,
0039 /// consequently, be least-recently-used.
0040 ///
0041 /// In our case matrices are stored in row-major order instead of the
0042 /// column-major order used in the BLIS implementation ([1]). This affects
0043 /// only the form of the BLIS micro-kernel and the computation of its
0044 /// parameters. In particular, reused elements of the matrix B are
0045 /// successively multiplied by specific elements of the matrix A.
0046 ///
0047 /// Refs.:
0048 /// [1] - Analytical Modeling is Enough for High Performance BLIS
0049 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
0050 /// Technical Report, 2014
0051 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
0052 ///
0053 /// @see ScheduleTreeOptimizer::createMicroKernel
0054 /// @see ScheduleTreeOptimizer::createMacroKernel
0055 /// @see getMicroKernelParams
0056 /// @see getMacroKernelParams
0057 ///
0058 /// TODO: Implement the packing transformation.
0059 ///
0060 /// @param Node The node that contains a band to be optimized. The node
0061 ///             is required to successfully pass
0062 ///             ScheduleTreeOptimizer::isMatrMultPattern.
0063 /// @param TTI  Target Transform Info.
0064 /// @param D    The dependencies.
0065 ///
0066 /// @returns    The transformed schedule node, or a null isl::schedule_node
0067 ///             if the optimization cannot be applied.
0068 isl::schedule_node
0069 tryOptimizeMatMulPattern(isl::schedule_node Node,
0070                          const llvm::TargetTransformInfo *TTI,
0071                          const Dependences *D);
0072
0073 } // namespace polly
0074 #endif // POLLY_MATMULOPTIMIZER_H