clang 20.0.0 (based on r547379) from build 12806354. Bug: http://b/379133546 Test: N/A Change-Id: I2eb8938af55d809de674be63cb30cf27e801862b Upstream-Commit: ad834e67b1105d15ef907f6255d4c96e8e733f57
75 lines
3.0 KiB
C++
75 lines
3.0 KiB
C++
//===- MatmulOptimizer.h -------------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef POLLY_MATMULOPTIMIZER_H
|
|
#define POLLY_MATMULOPTIMIZER_H
|
|
|
|
#include "isl/isl-noexceptions.h"
|
|
|
|
namespace llvm {
|
|
class TargetTransformInfo;
|
|
}
|
|
|
|
namespace polly {
|
|
class Dependences;
|
|
|
|
/// Apply the BLIS matmul optimization pattern if possible.
|
|
///
|
|
/// Make the loops containing the matrix multiplication be the innermost
|
|
/// loops and apply the BLIS matmul optimization pattern. BLIS implements
|
|
/// gemm as three nested loops around a macro-kernel, plus two packing
|
|
/// routines. The macro-kernel is implemented in terms of two additional
|
|
/// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
|
|
/// (i.e., outer product) update.
|
|
///
|
|
/// For a detailed description please see [1].
|
|
///
|
|
/// The order of the loops defines the data reused in the BLIS implementation
|
|
/// of gemm ([1]). In particular, elements of the matrix B, the second
|
|
/// operand of matrix multiplication, are reused between iterations of the
|
|
/// innermost loop. To keep the reused data in cache, only elements of matrix
|
|
/// A, the first operand of matrix multiplication, should be evicted during
|
|
/// an iteration of the innermost loop. To provide such a cache replacement
|
|
/// policy, elements of the matrix A can, in particular, be loaded first and,
|
|
/// consequently, be least-recently-used.
|
|
///
|
|
/// In our case matrices are stored in row-major order instead of the
|
|
/// column-major order used in the BLIS implementation ([1]). This affects
|
|
/// only the form of the BLIS micro kernel and the computation of its
|
|
/// parameters. In particular, reused elements of the matrix B are
|
|
/// successively multiplied by specific elements of the matrix A.
|
|
///
|
|
/// Refs.:
|
|
/// [1] - Analytical Modeling is Enough for High Performance BLIS
|
|
/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
|
|
/// Technical Report, 2014
|
|
/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
|
|
///
|
|
/// @see ScheduleTreeOptimizer::createMicroKernel
|
|
/// @see ScheduleTreeOptimizer::createMacroKernel
|
|
/// @see getMicroKernelParams
|
|
/// @see getMacroKernelParams
|
|
///
|
|
/// TODO: Implement the packing transformation.
|
|
///
|
|
/// @param Node The node that contains a band to be optimized. The node
|
|
/// is required to successfully pass
|
|
/// ScheduleTreeOptimizer::isMatrMultPattern.
|
|
/// @param TTI Target Transform Info.
|
|
/// @param D The dependencies.
|
|
///
|
|
/// @returns The transformed schedule node, or a null isl::schedule_node if
|
|
/// the optimization cannot be applied.
|
|
isl::schedule_node
|
|
tryOptimizeMatMulPattern(isl::schedule_node Node,
|
|
const llvm::TargetTransformInfo *TTI,
|
|
const Dependences *D);
|
|
|
|
} // namespace polly
|
|
#endif // POLLY_MATMULOPTIMIZER_H
|