//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the OpenMPIRBuilder class and helpers used as a convenient
// way to create LLVM instructions for OpenMP directives.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H

#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Allocator.h"
#include "llvm/TargetParser/Triple.h"
#include <forward_list>
#include <map>
#include <optional>

namespace llvm {
class CanonicalLoopInfo;
struct TargetRegionEntryInfo;
class OffloadEntriesInfoManager;
class OpenMPIRBuilder;

/// Move the instruction after an InsertPoint to the beginning of another
/// BasicBlock.
///
/// The instructions after \p IP are moved to the beginning of \p New which must
/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
/// \p New will be added such that there is no semantic change. Otherwise, the
/// \p IP insert block remains degenerate and it is up to the caller to insert a
/// terminator.
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
              bool CreateBranch);

/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
/// insert location will stick to after the instruction before the insertion
/// point (instead of moving with the instruction the InsertPoint stores
/// internally).
void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);

/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
/// (missing the terminator).
///
/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
/// is true, a branch to the new successor will be created such that
/// semantically there is no change; otherwise the block of the insertion point
/// remains degenerate and it is the caller's responsibility to insert a
/// terminator. Returns the new successor block.
BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator). Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator). Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);

/// Like splitBB, but reuses the current block's name for the new name.
BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                              llvm::Twine Suffix = ".split");

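// Illustrative usage sketch (not part of the upstream header): splitting the
// current block at an IRBuilder's insertion point. Assumes an IRBuilder<>
// named `Builder` that is positioned inside a function under construction.
//
//   // Split at the insertion point and branch to the new successor so the
//   // CFG stays semantically unchanged; code generation continues right
//   // before the new branch.
//   BasicBlock *ContBB = splitBB(Builder, /*CreateBranch=*/true, "omp.cont");
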
/// Captures attributes that affect generating LLVM-IR using the
/// OpenMPIRBuilder and related classes. Note that not all attributes are
/// required for all classes or functions. In some use cases the configuration
/// is not necessary at all, because the only functions that are called
/// are ones that are not dependent on the configuration.
class OpenMPIRBuilderConfig {
public:
  /// Flag to define whether to generate code for the role of the OpenMP host
  /// (if set to false) or device (if set to true) in an offloading context. It
  /// is set when the -fopenmp-is-target-device compiler frontend option is
  /// specified.
  std::optional<bool> IsTargetDevice;

  /// Flag for specifying if the compilation is done for an accelerator. It is
  /// set according to the architecture of the target triple and currently only
  /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
  /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
  /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
  /// like target with the ability to work as the OpenMP host is added, or if
  /// the capabilities of the currently supported GPU architectures are
  /// expanded.
  std::optional<bool> IsGPU;

  /// Flag for specifying if LLVMUsed information should be emitted.
  std::optional<bool> EmitLLVMUsedMetaInfo;

  /// Flag for specifying if offloading is mandatory.
  std::optional<bool> OpenMPOffloadMandatory;

  /// First separator used between the initial two parts of a name.
  std::optional<StringRef> FirstSeparator;
  /// Separator used between all other consecutive parts of a name.
  std::optional<StringRef> Separator;

  // Grid Value for the GPU target
  std::optional<omp::GV> GridValue;

  /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
  /// false`), this contains the list of offloading triples associated, if any.
  SmallVector<Triple> TargetTriples;

  OpenMPIRBuilderConfig();
  OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                        bool OpenMPOffloadMandatory,
                        bool HasRequiresReverseOffload,
                        bool HasRequiresUnifiedAddress,
                        bool HasRequiresUnifiedSharedMemory,
                        bool HasRequiresDynamicAllocators);

  // Getter functions that assert if the required values are not present.
  bool isTargetDevice() const {
    assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
    return *IsTargetDevice;
  }

  bool isGPU() const {
    assert(IsGPU.has_value() && "IsGPU is not set");
    return *IsGPU;
  }

  bool openMPOffloadMandatory() const {
    assert(OpenMPOffloadMandatory.has_value() &&
           "OpenMPOffloadMandatory is not set");
    return *OpenMPOffloadMandatory;
  }

  omp::GV getGridValue() const {
    assert(GridValue.has_value() && "GridValue is not set");
    return *GridValue;
  }

  bool hasRequiresFlags() const { return RequiresFlags; }
  bool hasRequiresReverseOffload() const;
  bool hasRequiresUnifiedAddress() const;
  bool hasRequiresUnifiedSharedMemory() const;
  bool hasRequiresDynamicAllocators() const;

  /// Returns requires directive clauses as flags compatible with those expected
  /// by libomptarget.
  int64_t getRequiresFlags() const;

  // Returns the FirstSeparator if set, otherwise the default separator
  // depending on isGPU.
  StringRef firstSeparator() const {
    if (FirstSeparator.has_value())
      return *FirstSeparator;
    if (isGPU())
      return "_";
    return ".";
  }

  // Returns the Separator if set, otherwise the default separator depending
  // on isGPU.
  StringRef separator() const {
    if (Separator.has_value())
      return *Separator;
    if (isGPU())
      return "$";
    return ".";
  }

  void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
  void setIsGPU(bool Value) { IsGPU = Value; }
  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
  void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
  void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
  void setSeparator(StringRef S) { Separator = S; }
  void setGridValue(omp::GV G) { GridValue = G; }

  void setHasRequiresReverseOffload(bool Value);
  void setHasRequiresUnifiedAddress(bool Value);
  void setHasRequiresUnifiedSharedMemory(bool Value);
  void setHasRequiresDynamicAllocators(bool Value);

private:
  /// Flags for specifying which requires directive clauses are present.
  int64_t RequiresFlags;
};

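// Illustrative configuration sketch (not part of the upstream header):
// setting up a host-side configuration and handing it to an OpenMPIRBuilder.
// `OMPBuilder` is an assumed, already-constructed OpenMPIRBuilder instance.
//
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/false,
//                                /*HasRequiresDynamicAllocators=*/false);
//   OMPBuilder.setConfig(Config);
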
/// Data structure to contain the information needed to uniquely identify
/// a target entry.
struct TargetRegionEntryInfo {
  /// The prefix used for kernel names.
  static constexpr const char *KernelNamePrefix = "__omp_offloading_";

  std::string ParentName;
  unsigned DeviceID;
  unsigned FileID;
  unsigned Line;
  unsigned Count;

  TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
  TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
                        unsigned FileID, unsigned Line, unsigned Count = 0)
      : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
        Count(Count) {}

  static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                         StringRef ParentName,
                                         unsigned DeviceID, unsigned FileID,
                                         unsigned Line, unsigned Count);

  bool operator<(const TargetRegionEntryInfo &RHS) const {
    return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
           std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
                           RHS.Count);
  }
};

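// Illustrative sketch (not part of the upstream header): deriving a kernel
// name from an entry. The exact layout is produced by
// getTargetRegionEntryFnName and should be treated as an implementation
// detail; it is expected to look roughly like
// "__omp_offloading_<device-id>_<file-id>_<parent>_l<line>".
//
//   SmallString<128> KernelName;
//   TargetRegionEntryInfo::getTargetRegionEntryFnName(
//       KernelName, /*ParentName=*/"foo", /*DeviceID=*/0x10,
//       /*FileID=*/0xabcd, /*Line=*/42, /*Count=*/0);
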
/// Class that manages information about offload code regions and data
class OffloadEntriesInfoManager {
  OpenMPIRBuilder *OMPBuilder;
  /// Number of entries registered so far.
  unsigned OffloadingEntriesNum = 0;

public:
  /// Base class of the entries info.
  class OffloadEntryInfo {
  public:
    /// Kind of a given entry.
    enum OffloadingEntryInfoKinds : unsigned {
      /// Entry is a target region.
      OffloadingEntryInfoTargetRegion = 0,
      /// Entry is a declare target variable.
      OffloadingEntryInfoDeviceGlobalVar = 1,
      /// Invalid entry info.
      OffloadingEntryInfoInvalid = ~0u
    };

  protected:
    OffloadEntryInfo() = delete;
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
                              uint32_t Flags)
        : Flags(Flags), Order(Order), Kind(Kind) {}
    ~OffloadEntryInfo() = default;

  public:
    bool isValid() const { return Order != ~0u; }
    unsigned getOrder() const { return Order; }
    OffloadingEntryInfoKinds getKind() const { return Kind; }
    uint32_t getFlags() const { return Flags; }
    void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
    Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
    void setAddress(Constant *V) {
      assert(!Addr.pointsToAliveValue() && "Address has been set before!");
      Addr = V;
    }
    static bool classof(const OffloadEntryInfo *Info) { return true; }

  private:
    /// Address of the entity that has to be mapped for offloading.
    WeakTrackingVH Addr;

    /// Flags associated with the device global.
    uint32_t Flags = 0u;

    /// Order this entry was emitted.
    unsigned Order = ~0u;

    OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
  };

  /// Return true if there are no entries defined.
  bool empty() const;
  /// Return number of entries defined so far.
  unsigned size() const { return OffloadingEntriesNum; }

  OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}

  //
  // Target region entries related.
  //

  /// Kind of the target registry entry.
  enum OMPTargetRegionEntryKind : uint32_t {
    /// Mark the entry as target region.
    OMPTargetRegionEntryTargetRegion = 0x0,
  };

  /// Target region entries info.
  class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
    /// Address that can be used as the ID of the entry.
    Constant *ID = nullptr;

  public:
    OffloadEntryInfoTargetRegion()
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
    explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
                                          Constant *ID,
                                          OMPTargetRegionEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
          ID(ID) {
      setAddress(Addr);
    }

    Constant *getID() const { return ID; }
    void setID(Constant *V) {
      assert(!ID && "ID has been set before!");
      ID = V;
    }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoTargetRegion;
    }
  };

  /// Initialize target region entry.
  /// This is ONLY needed for DEVICE compilation.
  void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                       unsigned Order);
  /// Register target region entry.
  void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                     Constant *Addr, Constant *ID,
                                     OMPTargetRegionEntryKind Flags);
  /// Return true if a target region entry with the provided information
  /// exists.
  bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                bool IgnoreAddressId = false) const;

  // Return the Name based on \a EntryInfo using the next available Count.
  void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                  const TargetRegionEntryInfo &EntryInfo);

  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
                            const OffloadEntryInfoTargetRegion &)>
      OffloadTargetRegionEntryInfoActTy;
  void
  actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);

  //
  // Device global variable entries related.
  //

  /// Kind of the global variable entry.
  enum OMPTargetGlobalVarEntryKind : uint32_t {
    /// Mark the entry as a 'declare target to' entry.
    OMPTargetGlobalVarEntryTo = 0x0,
    /// Mark the entry as a 'declare target link' entry.
    OMPTargetGlobalVarEntryLink = 0x1,
    /// Mark the entry as a 'declare target enter' entry.
    OMPTargetGlobalVarEntryEnter = 0x2,
    /// Mark the entry as having no declare target entry kind.
    OMPTargetGlobalVarEntryNone = 0x3,
    /// Mark the entry as a declare target indirect global.
    OMPTargetGlobalVarEntryIndirect = 0x8,
    /// Mark the entry as a register requires global.
    OMPTargetGlobalRegisterRequires = 0x10,
  };

  /// Kind of device clause for declare target variables and functions.
  /// NOTE: Currently not used as part of a variable entry; used by Flang and
  /// Clang to interface with the variable-related registration functions.
  enum OMPTargetDeviceClauseKind : uint32_t {
    /// The target is marked for all devices
    OMPTargetDeviceClauseAny = 0x0,
    /// The target is marked for non-host devices
    OMPTargetDeviceClauseNoHost = 0x1,
    /// The target is marked for host devices
    OMPTargetDeviceClauseHost = 0x2,
    /// The target is marked as having no clause
    OMPTargetDeviceClauseNone = 0x3
  };

  /// Device global variable entries info.
  class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
    /// Size of the global variable.
    int64_t VarSize;
    GlobalValue::LinkageTypes Linkage;
    const std::string VarName;

  public:
    OffloadEntryInfoDeviceGlobalVar()
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
                                             OMPTargetGlobalVarEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
                                             int64_t VarSize,
                                             OMPTargetGlobalVarEntryKind Flags,
                                             GlobalValue::LinkageTypes Linkage,
                                             const std::string &VarName)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
          VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
      setAddress(Addr);
    }

    int64_t getVarSize() const { return VarSize; }
    StringRef getVarName() const { return VarName; }
    void setVarSize(int64_t Size) { VarSize = Size; }
    GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
    void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
    }
  };

  /// Initialize device global variable entry.
  /// This is ONLY used for DEVICE compilation.
  void initializeDeviceGlobalVarEntryInfo(StringRef Name,
                                          OMPTargetGlobalVarEntryKind Flags,
                                          unsigned Order);

  /// Register device global variable entry.
  void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr,
                                        int64_t VarSize,
                                        OMPTargetGlobalVarEntryKind Flags,
                                        GlobalValue::LinkageTypes Linkage);
  /// Checks if the variable with the given name has been registered already.
  bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
    return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
  }
  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
      OffloadDeviceGlobalVarEntryInfoActTy;
  void actOnDeviceGlobalVarEntriesInfo(
      const OffloadDeviceGlobalVarEntryInfoActTy &Action);

private:
  /// Return the count of entries at a particular source location.
  unsigned
  getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;

  /// Update the count of entries at a particular source location.
  void
  incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);

  static TargetRegionEntryInfo
  getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
    return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
                                 EntryInfo.FileID, EntryInfo.Line, 0);
  }

  // Count of entries at a location.
  std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;

  // Storage for target region entries kind.
  typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
      OffloadEntriesTargetRegionTy;
  OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
  /// Storage for device global variable entries kind. The storage is to be
  /// indexed by mangled name.
  typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
      OffloadEntriesDeviceGlobalVarTy;
  OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
};

/// An interface to create LLVM-IR for OpenMP directives.
///
/// Each OpenMP directive has a corresponding public generator method.
class OpenMPIRBuilder {
public:
  /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
  /// not have an effect on \p M (see initialize).
  OpenMPIRBuilder(Module &M)
      : M(M), Builder(M.getContext()), OffloadInfoManager(this),
        T(Triple(M.getTargetTriple())) {}
  ~OpenMPIRBuilder();

  /// Initialize the internal state; this will put structure types and
  /// potentially other helpers into the underlying module. Must be called
  /// before any other method and only once! This internal state includes types
  /// used in the OpenMPIRBuilder generated from OMPKinds.def.
  void initialize();

  void setConfig(OpenMPIRBuilderConfig C) { Config = C; }

  /// Finalize the underlying module, e.g., by outlining regions.
  /// \param Fn The function to be finalized. If not used,
  /// all functions are finalized.
  void finalize(Function *Fn = nullptr);

  /// Add attributes known for \p FnID to \p Fn.
  void addAttributes(omp::RuntimeFunction FnID, Function &Fn);

  /// Type used throughout for insertion points.
  using InsertPointTy = IRBuilder<>::InsertPoint;

  /// Create a name using the platform-specific separators.
  /// \param Parts parts of the final name that need separation
  /// The created name has a first separator between the first and second part
  /// and a second separator between all other parts.
  /// E.g. with FirstSeparator "$" and Separator "." and
  /// parts: "p1", "p2", "p3", "p4"
  /// The resulting name is "p1$p2.p3.p4"
  /// The separators are retrieved from the OpenMPIRBuilderConfig.
  std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;

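  // Illustrative sketch (not part of the upstream header): with FirstSeparator
  // "$" and Separator "." this is expected to yield "omp$reduction.var", as
  // described above; with the default host separators it is expected to yield
  // "omp.reduction.var". `OMPBuilder` is an assumed OpenMPIRBuilder instance
  // whose config has already been set.
  //
  //   std::string Name =
  //       OMPBuilder.createPlatformSpecificName({"omp", "reduction", "var"});
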
  /// Callback type for variable finalization (think destructors).
  ///
  /// \param CodeGenIP is the insertion point at which the finalization code
  ///                  should be placed.
  ///
  /// A finalize callback knows about all objects that need finalization, e.g.
  /// destruction, when the scope of the currently generated construct is left
  /// at the time, and location, the callback is invoked.
  using FinalizeCallbackTy = std::function<void(InsertPointTy CodeGenIP)>;

  struct FinalizationInfo {
    /// The finalization callback provided by the last in-flight invocation of
    /// createXXXX for the directive of kind DK.
    FinalizeCallbackTy FiniCB;

    /// The directive kind of the innermost directive that has an associated
    /// region which might require finalization when it is left.
    omp::Directive DK;

    /// Flag to indicate if the directive is cancellable.
    bool IsCancellable;
  };

  /// Push a finalization callback on the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void pushFinalizationCB(const FinalizationInfo &FI) {
    FinalizationStack.push_back(FI);
  }

  /// Pop the last finalization callback from the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void popFinalizationCB() { FinalizationStack.pop_back(); }

  /// Callback type for body (=inner region) code generation
  ///
  /// The callback takes code locations as arguments, each describing a
  /// location where additional instructions can be inserted.
  ///
  /// The CodeGenIP may be in the middle of a basic block or point to the end of
  /// it. The basic block may have a terminator or be degenerate. The callback
  /// function may just insert instructions at that position, but also split the
  /// block (without the Before argument of BasicBlock::splitBasicBlock such
  /// that the identity of the split predecessor block is preserved) and insert
  /// additional control flow, including branches that do not lead back to what
  /// follows the CodeGenIP. Note that since the callback is allowed to split
  /// the block, callers must assume that InsertPoints to positions in the
  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
  /// such InsertPoints need to be preserved, the caller can split the block
  /// itself before calling the callback.
  ///
  /// AllocaIP and CodeGenIP must not point to the same position.
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed. The BasicBlock it is pointing to must
  ///                 not be split.
  /// \param CodeGenIP is the insertion point at which the body code should be
  ///                  placed.
  using BodyGenCallbackTy =
      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  // This is created primarily for the sections construct, as llvm::function_ref
  // (BodyGenCallbackTy) is not storable (as described in the comments of the
  // function_ref class); function_ref contains a non-owning reference to the
  // callable.
  using StorableBodyGenCallbackTy =
      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  /// Callback type for loop body code generation.
  ///
  /// \param CodeGenIP is the insertion point where the loop's body code must be
  ///                  placed. This will be a dedicated BasicBlock with a
  ///                  conditional branch from the loop condition check and
  ///                  terminated with an unconditional branch to the loop
  ///                  latch.
  /// \param IndVar is the induction variable usable at the insertion point.
  using LoopBodyGenCallbackTy =
      function_ref<void(InsertPointTy CodeGenIP, Value *IndVar)>;

  /// Callback type for variable privatization (think copy & default
  /// constructor).
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed.
  /// \param CodeGenIP is the insertion point at which the privatization code
  ///                  should be placed.
  /// \param Original The value being copied/created, should not be used in the
  ///                 generated IR.
  /// \param Inner The equivalent of \p Original that should be used in the
  ///              generated IR; this is equal to \p Original if the value is
  ///              a pointer and can thus be passed directly, otherwise it is
  ///              an equivalent but different value.
  /// \param ReplVal The replacement value, thus a copy or newly created version
  ///                of \p Inner.
  ///
  /// \returns The new insertion point where code generation continues and
  ///          \p ReplVal the replacement value.
  using PrivatizeCallbackTy = function_ref<InsertPointTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
      Value &Inner, Value *&ReplVal)>;

  /// Description of an LLVM-IR insertion point (IP) and a debug/source location
  /// (filename, line, column, ...).
  struct LocationDescription {
    LocationDescription(const IRBuilderBase &IRB)
        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
    LocationDescription(const InsertPointTy &IP) : IP(IP) {}
    LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
        : IP(IP), DL(DL) {}
    InsertPointTy IP;
    DebugLoc DL;
  };

  /// Emitter methods for OpenMP directives.
  ///
  ///{

  /// Generator for '#omp barrier'
  ///
  /// \param Loc The location where the barrier directive was encountered.
  /// \param Kind The kind of directive that caused the barrier.
  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
  ///                        should be checked and acted upon.
  ///
  /// \returns The insertion point after the barrier.
  InsertPointTy createBarrier(const LocationDescription &Loc,
                              omp::Directive Kind, bool ForceSimpleCall = false,
                              bool CheckCancelFlag = true);

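  // Illustrative usage sketch (not part of the upstream header): emitting a
  // barrier for '#omp barrier'. Assumes an initialized OpenMPIRBuilder
  // `OMPBuilder` and an IRBuilder<> `Builder` positioned in the encountering
  // function.
  //
  //   OpenMPIRBuilder::LocationDescription Loc(Builder);
  //   Builder.restoreIP(
  //       OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier));
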
  /// Generator for '#omp cancel'
  ///
  /// \param Loc The location where the directive was encountered.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param CanceledDirective The kind of directive that is canceled.
  ///
  /// \returns The insertion point after the cancel.
  InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
                             omp::Directive CanceledDirective);

  /// Generator for '#omp parallel'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param PrivCB Callback to copy a given variable (think copy constructor).
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
  /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
  /// \param IsCancellable Flag to indicate a cancellable parallel region.
  ///
  /// \returns The insertion position *after* the parallel.
  IRBuilder<>::InsertPoint
  createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP,
                 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
                 FinalizeCallbackTy FiniCB, Value *IfCondition,
                 Value *NumThreads, omp::ProcBindKind ProcBind,
                 bool IsCancellable);

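  // Illustrative usage sketch (not part of the upstream header): emitting a
  // minimal '#omp parallel' region with no privatization and no finalization
  // work. `OMPBuilder`, `Builder`, `Loc`, and `AllocaIP` are assumed to be set
  // up by the caller.
  //
  //   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
  //                        OpenMPIRBuilder::InsertPointTy CodeGenIP) {
  //     Builder.restoreIP(CodeGenIP);
  //     // ... emit the parallel region body here ...
  //   };
  //   auto PrivCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
  //                     OpenMPIRBuilder::InsertPointTy CodeGenIP,
  //                     Value &Original, Value &Inner, Value *&ReplVal) {
  //     ReplVal = &Inner; // No privatization in this sketch.
  //     return CodeGenIP;
  //   };
  //   auto FiniCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP) {};
  //   Builder.restoreIP(OMPBuilder.createParallel(
  //       Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
  //       /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
  //       /*IsCancellable=*/false));
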
  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// This generator operates on the logical iteration space of the loop, i.e.
  /// the caller only has to provide a loop trip count of the loop as defined by
  /// base language semantics. The trip count is interpreted as an unsigned
  /// integer. The induction variable passed to \p BodyGenCB will be of the same
  /// type and run from 0 to \p TripCount - 1. It is up to the callback to
  /// convert the logical iteration variable to the loop counter variable in the
  /// loop body.
  ///
  /// \param Loc       The insert and source location description. The insert
  ///                  location can be between two instructions or the end of a
  ///                  degenerate block (e.g. a BB under construction).
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param TripCount Number of iterations the loop body is executed.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *TripCount,
                                         const Twine &Name = "loop");

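  // Illustrative usage sketch (not part of the upstream header): a canonical
  // loop over a logical iteration space of 128 iterations. `OMPBuilder`,
  // `Builder`, and `Loc` are assumed to be set up by the caller.
  //
  //   auto LoopBodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
  //                            Value *IndVar) {
  //     Builder.restoreIP(CodeGenIP);
  //     // ... emit the loop body; IndVar runs from 0 to TripCount - 1 ...
  //   };
  //   Value *TripCount = Builder.getInt32(128);
  //   CanonicalLoopInfo *CLI =
  //       OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, TripCount);
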
  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// Instead of a logical iteration space, this allows specifying user-defined
  /// loop counter values using increment, upper- and lower bounds. To
  /// disambiguate the terminology when counting downwards, instead of lower
  /// bounds we use \p Start for the loop counter value in the first body
  /// iteration.
  ///
  /// Consider the following limitations:
  ///
  ///  * A loop counter space over all integer values of its bit-width cannot be
  ///    represented. E.g. using uint8_t, the loop trip count of 256 cannot be
  ///    stored into an 8-bit integer:
  ///
  ///      DO I = 0, 255, 1
  ///
  ///  * Unsigned wrapping is only supported when wrapping only "once"; E.g.
  ///    effectively counting downwards:
  ///
  ///      for (uint8_t i = 100u; i > 0; i += 127u)
  ///
  ///
  /// TODO: May need to add additional parameters to represent:
  ///
  ///  * Allow representing downcounting with unsigned integers.
  ///
  ///  * Sign of the step and the comparison operator might disagree:
  ///
  ///      for (int i = 0; i < 42; i -= 1u)
  ///
  ///
  /// \param Loc       The insert and source location description.
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param Start     Value of the loop counter for the first iteration.
  /// \param Stop      Loop counter values past this will stop the loop.
  /// \param Step      Loop counter increment after each iteration; negative
  ///                  means counting down.
  /// \param IsSigned  Whether Start, Stop and Step are signed integers.
  /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
  ///                      counter.
  /// \param ComputeIP Insertion point for instructions computing the trip
  ///                  count. Can be used to ensure the trip count is available
  ///                  at the outermost loop of a loop nest. If not set,
  ///                  defaults to the preheader of the generated loop.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *Start, Value *Stop, Value *Step,
                                         bool IsSigned, bool InclusiveStop,
                                         InsertPointTy ComputeIP = {},
                                         const Twine &Name = "loop");

  /// Collapse a loop nest into a single loop.
  ///
  /// Merges loops of a loop nest into a single CanonicalLoopNest representation
  /// that has the same number of innermost loop iterations as the origin loop
  /// nest. The induction variables of the input loops are derived from the
  /// collapsed loop's induction variable. This is intended to be used to
  /// implement OpenMP's collapse clause. Before applying a directive,
  /// collapseLoops normalizes a loop nest to contain only a single loop and the
  /// directive's implementation does not need to handle multiple loops itself.
  /// This does not remove the need for directives to handle some loop-nest
  /// aspects themselves, such as the ordered(<n>) clause or the simd
  /// schedule-clause modifier of the worksharing-loop directive.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
  ///       body(i, j);
  /// \endcode
  ///
  /// After collapsing with Loops={i,j}, the loop is changed to
  /// \code
  ///   for (int ij = 0; ij < 63; ++ij) {
  ///     int i = ij / 9;
  ///     int j = ij % 9;
  ///     body(i, j);
  ///   }
  /// \endcode
  ///
  /// In the current implementation, the following limitations apply:
  ///
  ///  * All input loops have an induction variable of the same type.
  ///
  ///  * The collapsed loop will have the same trip count integer type as the
  ///    input loops. Therefore it is possible that the collapsed loop cannot
  ///    represent all iterations of the input loops. For instance, assuming a
  ///    32 bit integer type, and two input loops both iterating 2^16 times, the
  ///    theoretical trip count of the collapsed loop would be 2^32 iterations,
  ///    which cannot be represented in a 32-bit integer. Behavior is undefined
  ///    in this case.
  ///
  ///  * The trip counts of every input loop must be available at \p ComputeIP.
  ///    Non-rectangular loops are not yet supported.
  ///
  ///  * At each nest level, code between a surrounding loop and its nested loop
  ///    is hoisted into the loop body, and such code will be executed more
  ///    often than before collapsing (or not at all if any inner loop iteration
  ///    has a trip count of 0). This is permitted by the OpenMP specification.
  ///
  /// \param DL        Debug location for instructions added for collapsing,
  ///                  such as instructions to compute/derive the input loop's
  ///                  induction variables.
  /// \param Loops     Loops in the loop nest to collapse. Loops are specified
  ///                  from outermost-to-innermost and every control flow of a
  ///                  loop's body must pass through its directly nested loop.
  /// \param ComputeIP Where additional instructions that compute the collapsed
  ///                  trip count are inserted. If not set, defaults to before
  ///                  the generated loop.
  ///
  /// \returns The CanonicalLoopInfo object representing the collapsed loop.
  CanonicalLoopInfo *collapseLoops(DebugLoc DL,
                                   ArrayRef<CanonicalLoopInfo *> Loops,
                                   InsertPointTy ComputeIP);

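  // Illustrative usage sketch (not part of the upstream header): collapsing a
  // two-deep nest, e.g. for 'collapse(2)'. `OuterCLI` and `InnerCLI` are
  // assumed CanonicalLoopInfo objects for the two input loops (outermost
  // first), and `DL` an assumed debug location.
  //
  //   CanonicalLoopInfo *Collapsed =
  //       OMPBuilder.collapseLoops(DL, {OuterCLI, InnerCLI}, /*ComputeIP=*/{});
  //   // `Collapsed` can now be passed to a loop-associated directive such as
  //   // applyWorkshareLoop.
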
  /// Get the default alignment value for the given target.
  ///
  /// \param TargetTriple   Target triple
  /// \param Features       StringMap which describes extra CPU features
  static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                            const StringMap<bool> &Features);

  /// Retrieve (or create if non-existent) the address of a declare
  /// target variable, used in conjunction with registerTargetGlobalVariable
  /// to create declare target global variables.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP device
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
  /// these are required by Clang for bookkeeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variable's
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  Constant *getAddrOfDeclareTargetVar(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage);

  /// Registers a target variable for device or host.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP device
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable; these are required by Clang for
  /// bookkeeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variable's
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param Addr - the original llvm value (addr) of the variable to be
  /// registered
  void registerTargetGlobalVariable(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage,
      Type *LlvmPtrTy, Constant *Addr);

  /// Get the offset of the OMP_MAP_MEMBER_OF field.
  unsigned getFlagMemberOffset();

  /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
  /// the position given.
  /// \param Position - A value indicating the position of the parent
  /// of the member in the kernel argument structure, often retrieved
  /// by the parent's position in the combined information vectors used
  /// to generate the structure itself. Multiple children (members of)
  /// with the same parent will use the same returned member flag.
  omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);

  /// Given an initial flag set, this function modifies it to contain
  /// the passed in MemberOfFlag generated from the getMemberOfFlag
  /// function. The results are dependent on the existing flag bits
  /// set in the original flag set.
  /// \param Flags - The original set of flags to be modified with the
  /// passed in MemberOfFlag.
  /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
  /// slightly based on the getMemberOfFlag which adjusts the flag bits
  /// based on the member's position in its parent.
  void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
                              omp::OpenMPOffloadMappingFlags MemberOfFlag);

private:
  /// Modifies the canonical loop to be a statically-scheduled workshare loop
  /// which is executed on the device.
  ///
  /// This takes a \p CLI representing a canonical loop, such as the one
  /// created by \see createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it emits a call to an OpenMP
  /// runtime function in the preheader that invokes the OpenMP device RTL
  /// function which handles the worksharing of the loop body iterations.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                         InsertPointTy AllocaIP,
                                         omp::WorksharingLoopType LoopType);

  /// Modifies the canonical loop to be a statically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                         InsertPointTy AllocaIP,
                                         bool NeedsBarrier);

  /// Modifies the canonical loop to be a statically-scheduled workshare loop
  /// with a user-specified chunk size.
  ///
  /// \param DL           Debug location for instructions added for the
  ///                     workshare-loop construct itself.
  /// \param CLI          A descriptor of the canonical loop to workshare.
  /// \param AllocaIP     An insertion point for Alloca instructions usable in
  ///                     the preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
  ///                     loop.
  /// \param ChunkSize    The user-specified chunk size.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
                                                CanonicalLoopInfo *CLI,
                                                InsertPointTy AllocaIP,
                                                bool NeedsBarrier,
                                                Value *ChunkSize);

  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds, and then in
  /// each iteration to update the loop counter.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param SchedType Type of scheduling to be passed to the init function.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  /// \param Chunk    The size of loop chunk considered as a unit when
  ///                 scheduling. If \p nullptr, defaults to 1.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          omp::OMPScheduleType SchedType,
                                          bool NeedsBarrier,
                                          Value *Chunk = nullptr);

  /// Create an alternative version of the loop to support the if clause.
  ///
  /// The OpenMP if clause can require generating a second loop. This loop
  /// will be executed when the if clause condition is not met. createIfVersion
  /// adds a branch instruction to the copied loop if \p IfCond is not met.
  ///
  /// \param Loop       Original loop which should be versioned.
  /// \param IfCond     Value which corresponds to the if clause condition.
  /// \param VMap       Value-to-value map to define the relation between
  ///                   original and copied loop values and loop blocks.
  /// \param NamePrefix Optional name prefix for if.then and if.else blocks.
  void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
                       ValueToValueMapTy &VMap, const Twine &NamePrefix = "");

public:
  /// Modifies the canonical loop to be a workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// The concrete transformation is done by applyStaticWorkshareLoop,
  /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
  /// on the value of \p SchedKind and \p ChunkSize.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  /// \param SchedKind Scheduling algorithm to use.
  /// \param ChunkSize The chunk size for the inner loop.
  /// \param HasSimdModifier Whether the simd modifier is present in the
  ///                        schedule clause.
  /// \param HasMonotonicModifier Whether the monotonic modifier is present in
  ///                             the schedule clause.
  /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
  ///                                present in the schedule clause.
  /// \param HasOrderedClause Whether the (parameterless) ordered clause is
  ///                         present.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyWorkshareLoop(
      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
      bool NeedsBarrier,
      llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
      Value *ChunkSize = nullptr, bool HasSimdModifier = false,
      bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
      bool HasOrderedClause = false,
      omp::WorksharingLoopType LoopType =
          omp::WorksharingLoopType::ForStaticLoop);

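  // Illustrative usage sketch (not part of the upstream header): lowering a
  // '#omp for' with the default schedule onto an existing canonical loop.
  // `CLI`, `AllocaIP`, and `DL` are assumed to be provided by the caller.
  //
  //   OpenMPIRBuilder::InsertPointTy AfterIP =
  //       OMPBuilder.applyWorkshareLoop(DL, CLI, AllocaIP,
  //                                     /*NeedsBarrier=*/true);
  //   Builder.restoreIP(AfterIP);
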
  /// Tile a loop nest.
  ///
  /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
  /// \p Loops must be perfectly nested, from outermost to innermost loop
  /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
  /// of every loop and every tile size must be usable in the outermost
  /// loop's preheader. This implies that the loop nest is rectangular.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
  ///       body(i, j);
  /// \endcode
  ///
  /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
  /// \code
  ///   for (int i1 = 0; i1 < 3; ++i1)
  ///     for (int j1 = 0; j1 < 2; ++j1)
  ///       for (int i2 = 0; i2 < 5; ++i2)
  ///         for (int j2 = 0; j2 < 7; ++j2)
  ///           body(i1*5+i2, j1*7+j2);
  /// \endcode
  ///
  /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
  /// referred to as the floor loops, and the loops i2 and j2 are the tile
  /// loops. Tiling also handles non-constant trip counts, non-constant tile
  /// sizes and trip counts that are not multiples of the tile size. In the
  /// latter case the tile loop of the last floor-loop iteration will have
  /// fewer iterations than specified as its tile size.
  ///
  /// @param DL        Debug location for instructions added by tiling, for
  ///                  instance the floor- and tile trip count computation.
  /// @param Loops     Loops to tile. The CanonicalLoopInfo objects are
  ///                  invalidated by this method, i.e. should not be used
  ///                  after tiling.
  /// @param TileSizes For each loop in \p Loops, the tile size for that
  ///                  dimension.
  ///
  /// \returns A list of generated loops. Contains twice as many loops as the
  ///          input loop nest; the first half are the floor loops and the
  ///          second half are the tile loops.
  std::vector<CanonicalLoopInfo *>
  tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
            ArrayRef<Value *> TileSizes);

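  // Illustrative usage sketch (not part of the upstream header): tiling the
  // example nest above with tile sizes 5 and 7. `ICLI` and `JCLI` are assumed
  // CanonicalLoopInfo objects for the "i" and "j" loops.
  //
  //   std::vector<CanonicalLoopInfo *> Tiled =
  //       OMPBuilder.tileLoops(DL, {ICLI, JCLI},
  //                            {Builder.getInt32(5), Builder.getInt32(7)});
  //   // Tiled = {i1, j1, i2, j2}: floor loops first, then the tile loops.
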
  /// Fully unroll a loop.
  ///
  /// Instead of unrolling the loop immediately (and duplicating its body
  /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
  /// metadata.
  ///
  /// \param DL   Debug location for instructions added by unrolling.
  /// \param Loop The loop to unroll. The loop will be invalidated.
  void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);

  /// Fully or partially unroll a loop. How the loop is unrolled is determined
  /// using LLVM's LoopUnrollPass.
  ///
  /// \param DL   Debug location for instructions added by unrolling.
  /// \param Loop The loop to unroll. The loop will be invalidated.
  void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);

  /// Partially unroll a loop.
  ///
  /// The CanonicalLoopInfo of the unrolled loop for use with chained
  /// loop-associated directive can be requested using \p UnrolledCLI. Not
  /// needing the CanonicalLoopInfo allows more efficient code generation by
  /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
  /// A loop-associated directive applied to the unrolled loop needs to know the
  /// new trip count which means that if using a heuristically determined unroll
  /// factor (\p Factor == 0), that factor must be computed immediately. We are
  /// using the same logic as the LoopUnrollPass to derive the unroll factor,
  /// but which assumes that some canonicalization has taken place (e.g.
  /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
  /// better when the unrolled loop's CanonicalLoopInfo is not needed.
  ///
  /// \param DL          Debug location for instructions added by unrolling.
  /// \param Loop        The loop to unroll. The loop will be invalidated.
  /// \param Factor      The factor to unroll the loop by. A factor of 0
  ///                    indicates that a heuristic should be used to determine
  ///                    the unroll-factor.
  /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
  ///                    partially unrolled loop. Otherwise, uses loop metadata
  ///                    to defer unrolling to the LoopUnrollPass.
  void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
                         CanonicalLoopInfo **UnrolledCLI);

  /// Add metadata to simd-ize a loop. If \p IfCond is not nullptr, the loop
  /// is cloned. The metadata which prevents vectorization is added to
  /// the cloned loop. The cloned loop is executed when \p IfCond evaluates
  /// to false.
  ///
  /// \param Loop        The loop to simd-ize.
  /// \param AlignedVars The map which contains pairs of the pointer
  ///                    and its corresponding alignment.
  /// \param IfCond      The value which corresponds to the if clause
  ///                    condition.
  /// \param Order       The enum to map order clause.
  /// \param Simdlen     The Simdlen length to apply to the simd loop.
  /// \param Safelen     The Safelen length to apply to the simd loop.
  void applySimd(CanonicalLoopInfo *Loop,
                 MapVector<Value *, Value *> AlignedVars, Value *IfCond,
                 omp::OrderKind Order, ConstantInt *Simdlen,
                 ConstantInt *Safelen);

  /// Generator for '#omp flush'
  ///
  /// \param Loc The location where the flush directive was encountered
  void createFlush(const LocationDescription &Loc);

  /// Generator for '#omp taskwait'
  ///
  /// \param Loc The location where the taskwait directive was encountered.
  void createTaskwait(const LocationDescription &Loc);

  /// Generator for '#omp taskyield'
  ///
  /// \param Loc The location where the taskyield directive was encountered.
  void createTaskyield(const LocationDescription &Loc);

  /// A struct to pack the relevant information for an OpenMP depend clause.
  struct DependData {
    omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
    Type *DepValueType;
    Value *DepVal;
    explicit DependData() = default;
    DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
               Value *DepVal)
        : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
  };

  /// Generator for `#omp task`
  ///
  /// \param Loc The location where the task construct was encountered.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param Tied True if the task is tied, false if the task is untied.
  /// \param Final i1 value which is `true` if the task is final, `false` if the
  ///              task is not final.
  /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
  ///                    task is generated, and the encountering thread must
  ///                    suspend the current task region, for which execution
  ///                    cannot be resumed until execution of the structured
  ///                    block that is associated with the generated task is
  ///                    completed.
  InsertPointTy createTask(const LocationDescription &Loc,
                           InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                           bool Tied = true, Value *Final = nullptr,
                           Value *IfCondition = nullptr,
                           SmallVector<DependData> Dependencies = {});

/// Generator for the taskgroup construct
|
|
///
|
|
/// \param Loc The location where the taskgroup construct was encountered.
|
|
/// \param AllocaIP The insertion point to be used for alloca instructions.
|
|
/// \param BodyGenCB Callback that will generate the region code.
|
|
InsertPointTy createTaskgroup(const LocationDescription &Loc,
|
|
InsertPointTy AllocaIP,
|
|
BodyGenCallbackTy BodyGenCB);
|
|
|
|
using FileIdentifierInfoCallbackTy =
|
|
std::function<std::tuple<std::string, uint64_t>()>;
|
|
|
|
/// Creates a unique info for a target entry when provided a filename and
/// line number from \p CallBack.
///
/// \param CallBack   A callback function which should return the filename
///                   the entry resides in as well as the line number for the
///                   target entry.
/// \param ParentName The name of the parent the target entry resides in, if
///                   any.
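///
/// A minimal usage sketch (illustrative only; the file name, line number,
/// and parent name are placeholders):
/// \code
///   auto FileInfoCB = []() -> std::tuple<std::string, uint64_t> {
///     return {"main.c", /*LineNum=*/42};
///   };
///   TargetRegionEntryInfo EntryInfo =
///       OpenMPIRBuilder::getTargetEntryUniqueInfo(FileInfoCB, "foo");
/// \endcode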
static TargetRegionEntryInfo
|
|
getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
|
|
StringRef ParentName = "");
|
|
|
|
/// Enum class for the ReductionGen CallBack type to be used.
|
|
enum class ReductionGenCBKind { Clang, MLIR };
|
|
|
|
/// ReductionGen CallBack for Clang
|
|
///
|
|
/// \param CodeGenIP InsertPoint for CodeGen.
|
|
/// \param Index Index of the ReductionInfo to generate code for.
|
|
/// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
|
|
/// codegen, used for fixup later.
|
|
/// \param RHSPtr Optionally used by Clang to
|
|
/// return the RHSPtr it used for codegen, used for fixup later.
|
|
/// \param CurFn Optionally used by Clang to pass in the Current Function as
|
|
/// Clang context may be old.
|
|
using ReductionGenClangCBTy =
|
|
std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
|
|
Value **LHS, Value **RHS, Function *CurFn)>;
|
|
|
|
/// ReductionGen CallBack for MLIR
|
|
///
|
|
/// \param CodeGenIP InsertPoint for CodeGen.
|
|
/// \param LHS Pass in the LHS Value to be used for CodeGen.
|
|
/// \param RHS Pass in the RHS Value to be used for CodeGen.
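///
/// A minimal sketch of such a callback for an integer sum reduction
/// (illustrative only; `Builder` is an IRBuilder provided by the caller):
/// \code
///   ReductionGenCBTy SumGen = [&](InsertPointTy CodeGenIP, Value *LHS,
///                                 Value *RHS, Value *&Res) {
///     Builder.restoreIP(CodeGenIP);
///     Res = Builder.CreateAdd(LHS, RHS); // Combine the two partial values.
///     return Builder.saveIP();
///   };
/// \endcode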
using ReductionGenCBTy = std::function<InsertPointTy(
|
|
InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
|
|
|
|
/// Functions used to generate atomic reductions. Such functions take two
|
|
/// Values representing pointers to LHS and RHS of the reduction, as well as
|
|
/// the element type of these pointers. They are expected to atomically
|
|
/// update the LHS to the reduced value.
|
|
using ReductionGenAtomicCBTy =
|
|
std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
|
|
|
|
/// Enum class for reduction evaluation types scalar, complex and aggregate.
|
|
enum class EvalKind { Scalar, Complex, Aggregate };
|
|
|
|
/// Information about an OpenMP reduction.
|
|
struct ReductionInfo {
|
|
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
|
|
EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
|
|
ReductionGenClangCBTy ReductionGenClang,
|
|
ReductionGenAtomicCBTy AtomicReductionGen)
|
|
: ElementType(ElementType), Variable(Variable),
|
|
PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
|
|
ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
|
|
AtomicReductionGen(AtomicReductionGen) {}
|
|
ReductionInfo(Value *PrivateVariable)
|
|
: ElementType(nullptr), Variable(nullptr),
|
|
PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
|
|
ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
|
|
|
|
/// Reduction element type, must match pointee type of variable.
|
|
Type *ElementType;
|
|
|
|
/// Reduction variable of pointer type.
|
|
Value *Variable;
|
|
|
|
/// Thread-private partial reduction variable.
|
|
Value *PrivateVariable;
|
|
|
|
/// Reduction evaluation kind - scalar, complex or aggregate.
|
|
EvalKind EvaluationKind;
|
|
|
|
/// Callback for generating the reduction body. The IR produced by this will
|
|
/// be used to combine two values in a thread-safe context, e.g., under
|
|
/// lock or within the same thread, and therefore need not be atomic.
|
|
ReductionGenCBTy ReductionGen;
|
|
|
|
/// Clang callback for generating the reduction body. The IR produced by
|
|
/// this will be used to combine two values in a thread-safe context, e.g.,
|
|
/// under lock or within the same thread, and therefore need not be atomic.
|
|
ReductionGenClangCBTy ReductionGenClang;
|
|
|
|
/// Callback for generating the atomic reduction body, may be null. The IR
|
|
/// produced by this will be used to atomically combine two values during
|
|
/// reduction. If null, the implementation will use the non-atomic version
|
|
/// along with the appropriate synchronization mechanisms.
|
|
ReductionGenAtomicCBTy AtomicReductionGen;
|
|
};
|
|
|
|
enum class CopyAction : unsigned {
|
|
// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
|
|
// the warp using shuffle instructions.
|
|
RemoteLaneToThread,
|
|
// ThreadCopy: Make a copy of a Reduce list on the thread's stack.
|
|
ThreadCopy,
|
|
};
|
|
|
|
struct CopyOptionsTy {
|
|
Value *RemoteLaneOffset = nullptr;
|
|
Value *ScratchpadIndex = nullptr;
|
|
Value *ScratchpadWidth = nullptr;
|
|
};
|
|
|
|
/// Supporting functions for Reductions CodeGen.
|
|
private:
|
|
/// Emit the llvm.used metadata.
|
|
void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
|
|
|
|
/// Get the id of the current thread on the GPU.
|
|
Value *getGPUThreadID();
|
|
|
|
/// Get the GPU warp size.
|
|
Value *getGPUWarpSize();
|
|
|
|
/// Get the id of the warp in the block.
|
|
/// We assume that the warp size is 32, which is always the case
|
|
/// on the NVPTX device, to generate more efficient code.
|
|
Value *getNVPTXWarpID();
|
|
|
|
/// Get the id of the current lane in the Warp.
|
|
/// We assume that the warp size is 32, which is always the case
|
|
/// on the NVPTX device, to generate more efficient code.
|
|
Value *getNVPTXLaneID();
|
|
|
|
/// Cast value to the specified type.
|
|
Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
|
|
|
|
/// This function creates calls to one of two shuffle functions to copy
|
|
/// variables between lanes in a warp.
|
|
Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
|
|
Type *ElementType, Value *Offset);
|
|
|
|
/// Function to shuffle over the value from the remote lane.
|
|
void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
|
|
Type *ElementType, Value *Offset,
|
|
Type *ReductionArrayTy);
|
|
|
|
/// Emit instructions to copy a Reduce list, which contains partially
|
|
/// aggregated values, in the specified direction.
|
|
void emitReductionListCopy(
|
|
InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
|
|
ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
|
|
CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
|
|
|
|
/// Emit a helper that reduces data across two OpenMP threads (lanes)
|
|
/// in the same warp. It uses shuffle instructions to copy over data from
|
|
/// a remote lane's stack. The reduction algorithm performed is specified
|
|
/// by the fourth parameter.
|
|
///
|
|
/// Algorithm Versions.
|
|
/// Full Warp Reduce (argument value 0):
|
|
/// This algorithm assumes that all 32 lanes are active and gathers
|
|
/// data from these 32 lanes, producing a single resultant value.
|
|
/// Contiguous Partial Warp Reduce (argument value 1):
|
|
/// This algorithm assumes that only a *contiguous* subset of lanes
|
|
/// are active. This happens for the last warp in a parallel region
|
|
/// when the user specified num_threads is not an integer multiple of
|
|
/// 32. This contiguous subset always starts with the zeroth lane.
|
|
/// Partial Warp Reduce (argument value 2):
|
|
/// This algorithm gathers data from any number of lanes at any position.
|
|
/// All reduced values are stored in the lowest possible lane. The set
|
|
/// of problems every algorithm addresses is a superset of those
|
|
/// addressable by algorithms with a lower version number. Overhead
|
|
/// increases as algorithm version increases.
|
|
///
|
|
/// Terminology
|
|
/// Reduce element:
|
|
/// Reduce element refers to the individual data field with primitive
|
|
/// data types to be combined and reduced across threads.
|
|
/// Reduce list:
|
|
/// Reduce list refers to a collection of local, thread-private
|
|
/// reduce elements.
|
|
/// Remote Reduce list:
|
|
/// Remote Reduce list refers to a collection of remote (relative to
|
|
/// the current thread) reduce elements.
|
|
///
|
|
/// We distinguish between three states of threads that are important to
|
|
/// the implementation of this function.
|
|
/// Alive threads:
|
|
/// Threads in a warp executing the SIMT instruction, as distinguished from
|
|
/// threads that are inactive due to divergent control flow.
|
|
/// Active threads:
|
|
/// The minimal set of threads that has to be alive upon entry to this
|
|
/// function. The computation is correct iff active threads are alive.
|
|
/// Some threads are alive but they are not active because they do not
|
|
/// contribute to the computation in any useful manner. Turning them off
|
|
/// may introduce control flow overheads without any tangible benefits.
|
|
/// Effective threads:
|
|
/// In order to comply with the argument requirements of the shuffle
|
|
/// function, we must keep all lanes holding data alive. But at most
|
|
/// half of them perform value aggregation; we refer to this half of
|
|
/// threads as effective. The other half is simply handing off their
|
|
/// data.
|
|
///
|
|
/// Procedure
|
|
/// Value shuffle:
|
|
/// In this step active threads transfer data from higher lane positions
|
|
/// in the warp to lower lane positions, creating Remote Reduce list.
|
|
/// Value aggregation:
|
|
/// In this step, effective threads combine their thread local Reduce list
|
|
/// with Remote Reduce list and store the result in the thread local
|
|
/// Reduce list.
|
|
/// Value copy:
|
|
/// In this step, we deal with the assumption made by algorithm 2
|
|
/// (i.e. contiguity assumption). When we have an odd number of lanes
|
|
/// active, say 2k+1, only k threads will be effective and therefore k
|
|
/// new values will be produced. However, the Reduce list owned by the
|
|
/// (2k+1)th thread is ignored in the value aggregation. Therefore
|
|
/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
|
|
/// that the contiguity assumption still holds.
|
|
///
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReduceFn The reduction function.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The ShuffleAndReduce function.
|
|
Function *emitShuffleAndReduceFunction(
|
|
ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
|
|
Function *ReduceFn, AttributeList FuncAttrs);
|
|
|
|
/// This function emits a helper that gathers Reduce lists from the first
|
|
/// lane of every active warp to lanes in the first warp.
|
|
///
|
|
/// void inter_warp_copy_func(void* reduce_data, num_warps)
|
|
/// shared smem[warp_size];
|
|
/// For all data entries D in reduce_data:
|
|
/// sync
|
|
/// If (I am the first lane in each warp)
|
|
/// Copy my local D to smem[warp_id]
|
|
/// sync
|
|
/// if (I am the first warp)
|
|
/// Copy smem[thread_id] to my local D
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The InterWarpCopy function.
|
|
Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
|
|
ArrayRef<ReductionInfo> ReductionInfos,
|
|
AttributeList FuncAttrs);
|
|
|
|
/// This function emits a helper that copies all the reduction variables from
|
|
/// the team into the provided global buffer for the reduction variables.
|
|
///
|
|
/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
|
|
/// For all data entries D in reduce_data:
|
|
/// Copy local D to buffer.D[Idx]
|
|
///
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReductionsBufferTy The StructTy for the reductions buffer.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The ListToGlobalCopy function.
|
|
Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
|
|
Type *ReductionsBufferTy,
|
|
AttributeList FuncAttrs);
|
|
|
|
/// This function emits a helper that copies all the reduction variables from
|
|
/// the team into the provided global buffer for the reduction variables.
|
|
///
|
|
/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
|
|
/// For all data entries D in reduce_data:
|
|
/// Copy buffer.D[Idx] to local D;
|
|
///
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReductionsBufferTy The StructTy for the reductions buffer.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The GlobalToList function.
|
|
Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
|
|
Type *ReductionsBufferTy,
|
|
AttributeList FuncAttrs);
|
|
|
|
/// This function emits a helper that reduces all the reduction variables from
|
|
/// the team into the provided global buffer for the reduction variables.
|
|
///
|
|
/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
|
|
/// void *GlobPtrs[];
|
|
/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
|
|
/// ...
|
|
/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
|
|
/// reduce_function(GlobPtrs, reduce_data);
|
|
///
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReduceFn The reduction function.
|
|
/// \param ReductionsBufferTy The StructTy for the reductions buffer.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The ListToGlobalReduce function.
|
|
Function *
|
|
emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
|
|
Function *ReduceFn, Type *ReductionsBufferTy,
|
|
AttributeList FuncAttrs);
|
|
|
|
/// This function emits a helper that reduces all the reduction variables from
|
|
/// the team into the provided global buffer for the reduction variables.
|
|
///
|
|
/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
|
|
/// void *GlobPtrs[];
|
|
/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
|
|
/// ...
|
|
/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
|
|
/// reduce_function(reduce_data, GlobPtrs);
|
|
///
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReduceFn The reduction function.
|
|
/// \param ReductionsBufferTy The StructTy for the reductions buffer.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The GlobalToListReduce function.
|
|
Function *
|
|
emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
|
|
Function *ReduceFn, Type *ReductionsBufferTy,
|
|
AttributeList FuncAttrs);
|
|
|
|
/// Get the function name of a reduction function.
|
|
std::string getReductionFuncName(StringRef Name) const;
|
|
|
|
/// Emits reduction function.
|
|
/// \param ReducerName Name of the function calling the reduction.
|
|
/// \param ReductionInfos Array type containing the ReductionOps.
|
|
/// \param ReductionGenCBKind Optional param to specify Clang or MLIR
|
|
/// CodeGenCB kind.
|
|
/// \param FuncAttrs Optional param to specify any function attributes that
|
|
/// need to be copied to the new function.
|
|
///
|
|
/// \return The reduction function.
|
|
Function *createReductionFunction(
|
|
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
|
|
ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
|
|
AttributeList FuncAttrs = {});
|
|
|
|
public:
|
|
///
|
|
/// Design of OpenMP reductions on the GPU
|
|
///
|
|
/// Consider a typical OpenMP program with one or more reduction
|
|
/// clauses:
|
|
///
|
|
/// float foo;
|
|
/// double bar;
|
|
/// #pragma omp target teams distribute parallel for \
|
|
/// reduction(+:foo) reduction(*:bar)
|
|
/// for (int i = 0; i < N; i++) {
|
|
/// foo += A[i]; bar *= B[i];
|
|
/// }
|
|
///
|
|
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
|
|
/// all teams. In our OpenMP implementation on the NVPTX device an
|
|
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
|
|
/// within a team are mapped to CUDA threads within a threadblock.
|
|
/// Our goal is to efficiently aggregate values across all OpenMP
|
|
/// threads such that:
|
|
///
|
|
/// - the compiler and runtime are logically concise, and
|
|
/// - the reduction is performed efficiently in a hierarchical
|
|
/// manner as follows: within OpenMP threads in the same warp,
|
|
/// across warps in a threadblock, and finally across teams on
|
|
/// the NVPTX device.
|
|
///
|
|
/// Introduction to Decoupling
|
|
///
|
|
/// We would like to decouple the compiler and the runtime so that the
|
|
/// latter is ignorant of the reduction variables (number, data types)
|
|
/// and the reduction operators. This allows a simpler interface
|
|
/// and implementation while still attaining good performance.
|
|
///
|
|
/// Pseudocode for the aforementioned OpenMP program generated by the
|
|
/// compiler is as follows:
|
|
///
|
|
/// 1. Create private copies of reduction variables on each OpenMP
|
|
/// thread: 'foo_private', 'bar_private'
|
|
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
|
|
/// to it and writes the result in 'foo_private' and 'bar_private'
|
|
/// respectively.
|
|
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
|
|
/// and store the result on the team master:
|
|
///
|
|
/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
|
|
/// reduceData, shuffleReduceFn, interWarpCpyFn)
|
|
///
|
|
/// where:
|
|
/// struct ReduceData {
|
|
/// double *foo;
|
|
/// double *bar;
|
|
/// } reduceData
|
|
/// reduceData.foo = &foo_private
|
|
/// reduceData.bar = &bar_private
|
|
///
|
|
/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
|
|
/// auxiliary functions generated by the compiler that operate on
|
|
/// variables of type 'ReduceData'. They aid the runtime in performing
/// algorithmic steps in a data-agnostic manner.
|
|
///
|
|
/// 'shuffleReduceFn' is a pointer to a function that reduces data
|
|
/// of type 'ReduceData' across two OpenMP threads (lanes) in the
|
|
/// same warp. It takes the following arguments as input:
|
|
///
|
|
/// a. variable of type 'ReduceData' on the calling lane,
|
|
/// b. its lane_id,
|
|
/// c. an offset relative to the current lane_id to generate a
|
|
/// remote_lane_id. The remote lane contains the second
|
|
/// variable of type 'ReduceData' that is to be reduced.
|
|
/// d. an algorithm version parameter determining which reduction
|
|
/// algorithm to use.
|
|
///
|
|
/// 'shuffleReduceFn' retrieves data from the remote lane using
|
|
/// efficient GPU shuffle intrinsics and reduces, using the
|
|
/// algorithm specified by the 4th parameter, the two operands
|
|
/// element-wise. The result is written to the first operand.
|
|
///
|
|
/// Different reduction algorithms are implemented in different
|
|
/// runtime functions, all calling 'shuffleReduceFn' to perform
|
|
/// the essential reduction step. Therefore, based on the 4th
|
|
/// parameter, this function behaves slightly differently to
|
|
/// cooperate with the runtime to ensure correctness under
|
|
/// different circumstances.
|
|
///
|
|
/// 'InterWarpCpyFn' is a pointer to a function that transfers
|
|
/// reduced variables across warps. It tunnels, through CUDA
|
|
/// shared memory, the thread-private data of type 'ReduceData'
|
|
/// from lane 0 of each warp to a lane in the first warp.
|
|
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
|
|
/// The last team writes the global reduced value to memory.
|
|
///
|
|
/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
|
|
/// reduceData, shuffleReduceFn, interWarpCpyFn,
|
|
/// scratchpadCopyFn, loadAndReduceFn)
|
|
///
|
|
/// 'scratchpadCopyFn' is a helper that stores reduced
|
|
/// data from the team master to a scratchpad array in
|
|
/// global memory.
|
|
///
|
|
/// 'loadAndReduceFn' is a helper that loads data from
|
|
/// the scratchpad array and reduces it with the input
|
|
/// operand.
|
|
///
|
|
/// These compiler generated functions hide address
|
|
/// calculation and alignment information from the runtime.
|
|
/// 5. if ret == 1:
|
|
/// The team master of the last team stores the reduced
|
|
/// result to the globals in memory.
|
|
/// foo += reduceData.foo; bar *= reduceData.bar
|
|
///
|
|
///
|
|
/// Warp Reduction Algorithms
|
|
///
|
|
/// On the warp level, we have three algorithms implemented in the
|
|
/// OpenMP runtime depending on the number of active lanes:
|
|
///
|
|
/// Full Warp Reduction
|
|
///
|
|
/// The reduce algorithm within a warp where all lanes are active
|
|
/// is implemented in the runtime as follows:
|
|
///
|
|
/// full_warp_reduce(void *reduce_data,
|
|
/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
|
|
/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
|
|
/// ShuffleReduceFn(reduce_data, 0, offset, 0);
|
|
/// }
|
|
///
|
|
/// The algorithm completes in log(2, WARPSIZE) steps.
|
|
///
|
|
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
|
|
/// represents the version of the algorithm being used, is set to 0 to
|
|
/// signify full warp reduction.
|
|
///
|
|
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
|
|
///
|
|
/// #reduce_elem refers to an element in the local lane's data structure
|
|
/// #remote_elem is retrieved from a remote lane
|
|
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
|
|
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
|
|
///
|
|
/// Contiguous Partial Warp Reduction
|
|
///
|
|
/// This reduce algorithm is used within a warp where only the first
|
|
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
|
|
/// number of OpenMP threads in a parallel region is not a multiple of
|
|
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
|
|
///
|
|
/// void
|
|
/// contiguous_partial_reduce(void *reduce_data,
|
|
/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
|
|
/// int size, int lane_id) {
|
|
/// int curr_size;
|
|
/// int offset;
|
|
/// curr_size = size;
|
|
/// offset = curr_size/2;
|
|
/// while (offset>0) {
|
|
/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
|
|
/// curr_size = (curr_size+1)/2;
|
|
/// offset = curr_size/2;
|
|
/// }
|
|
/// }
|
|
///
|
|
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
|
|
///
|
|
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
|
|
/// if (lane_id < offset)
|
|
/// reduce_elem = reduce_elem REDUCE_OP remote_elem
|
|
/// else
|
|
/// reduce_elem = remote_elem
|
|
///
|
|
/// This algorithm assumes that the data to be reduced are located in a
|
|
/// contiguous subset of lanes starting from the first. When there is
|
|
/// an odd number of active lanes, the data in the last lane is not
|
|
/// aggregated with any other lane's data but is instead copied over.
|
|
///
|
|
/// Dispersed Partial Warp Reduction
|
|
///
|
|
/// This algorithm is used within a warp when any discontiguous subset of
|
|
/// lanes are active. It is used to implement the reduction operation
|
|
/// across lanes in an OpenMP simd region or in a nested parallel region.
|
|
///
|
|
/// void
|
|
/// dispersed_partial_reduce(void *reduce_data,
|
|
/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
|
|
/// int size, remote_id;
|
|
/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
|
|
/// do {
|
|
/// remote_id = next_active_lane_id_right_after_me();
|
|
/// # the above function returns 0 if no active lane
|
|
/// # is present right after the current lane.
|
|
/// size = number_of_active_lanes_in_this_warp();
|
|
/// logical_lane_id /= 2;
|
|
/// ShuffleReduceFn(reduce_data, logical_lane_id,
|
|
/// remote_id-1-threadIdx.x, 2);
|
|
/// } while (logical_lane_id % 2 == 0 && size > 1);
|
|
/// }
|
|
///
|
|
/// There is no assumption made about the initial state of the reduction.
|
|
/// Any number of lanes (>=1) could be active at any position. The reduction
|
|
/// result is returned in the first active lane.
|
|
///
|
|
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
|
|
///
|
|
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
|
|
/// if (lane_id % 2 == 0 && offset > 0)
|
|
/// reduce_elem = reduce_elem REDUCE_OP remote_elem
|
|
/// else
|
|
/// reduce_elem = remote_elem
|
|
///
|
|
///
|
|
/// Intra-Team Reduction
|
|
///
|
|
/// This function, as implemented in the runtime call
|
|
/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
|
|
/// threads in a team. It first reduces within a warp using the
|
|
/// aforementioned algorithms. We then proceed to gather all such
|
|
/// reduced values at the first warp.
|
|
///
|
|
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
|
|
/// data from each "warp master" (the zeroth lane of each warp, where
|
|
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
|
|
/// a mathematical sense) the problem of reduction across warp masters in
|
|
/// a block to the problem of warp reduction.
|
|
///
|
|
///
|
|
/// Inter-Team Reduction
|
|
///
|
|
/// Once a team has reduced its data to a single value, it is stored in
|
|
/// a global scratchpad array. Since each team has a distinct slot, this
|
|
/// can be done without locking.
|
|
///
|
|
/// The last team to write to the scratchpad array proceeds to reduce the
|
|
/// scratchpad array. One or more workers in the last team use the helper
|
|
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
|
|
/// the k'th worker reduces every k'th element.
|
|
///
|
|
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
|
|
/// reduce across workers and compute a globally reduced value.
|
|
///
|
|
/// \param Loc            The location where the reduction was
///                       encountered. Must be within the associated
///                       directive and after the last local access to the
///                       reduction variables.
/// \param AllocaIP       An insertion point suitable for allocas usable
///                       in reductions.
/// \param CodeGenIP      An insertion point suitable for code generation.
/// \param ReductionInfos A list of info on each reduction variable.
/// \param IsNoWait       Optional flag set if the reduction is marked as
///                       nowait.
|
|
/// \param IsTeamsReduction Optional flag set if it is a teams
|
|
/// reduction.
|
|
/// \param HasDistribute Optional flag set if it is a
|
|
/// distribute reduction.
|
|
/// \param GridValue Optional GPU grid value.
|
|
/// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
|
|
/// used for teams reduction.
|
|
/// \param SrcLocInfo Source location information global.
|
|
InsertPointTy createReductionsGPU(
|
|
const LocationDescription &Loc, InsertPointTy AllocaIP,
|
|
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
|
|
bool IsNoWait = false, bool IsTeamsReduction = false,
|
|
bool HasDistribute = false,
|
|
ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
|
|
std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
|
|
Value *SrcLocInfo = nullptr);
|
|
|
|
// TODO: provide atomic and non-atomic reduction generators for reduction
|
|
// operators defined by the OpenMP specification.
|
|
|
|
/// Generator for '#omp reduction'.
|
|
///
|
|
/// Emits the IR instructing the runtime to perform the specific kind of
|
|
/// reductions. Expects reduction variables to have been privatized and
|
|
/// initialized to reduction-neutral values separately. Emits the calls to
|
|
/// runtime functions as well as the reduction function and the basic blocks
|
|
/// performing the reduction atomically and non-atomically.
|
|
///
|
|
/// The code emitted for the following:
|
|
///
|
|
/// \code
|
|
/// type var_1;
|
|
/// type var_2;
|
|
/// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
|
|
/// /* body */;
|
|
/// \endcode
|
|
///
|
|
/// corresponds to the following sketch.
|
|
///
|
|
/// \code
|
|
/// void _outlined_par() {
|
|
/// // N is the number of different reductions.
|
|
/// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
|
|
/// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
|
|
/// _omp_reduction_func,
|
|
/// _gomp_critical_user.reduction.var)) {
|
|
/// case 1: {
|
|
/// var_1 = var_1 <reduction-op> privatized_var_1;
|
|
/// var_2 = var_2 <reduction-op> privatized_var_2;
|
|
/// // ...
|
|
/// __kmpc_end_reduce(...);
|
|
/// break;
|
|
/// }
|
|
/// case 2: {
|
|
/// _Atomic<ReductionOp>(var_1, privatized_var_1);
|
|
/// _Atomic<ReductionOp>(var_2, privatized_var_2);
|
|
/// // ...
|
|
/// break;
|
|
/// }
|
|
/// default: break;
|
|
/// }
|
|
/// }
|
|
///
|
|
/// void _omp_reduction_func(void **lhs, void **rhs) {
|
|
/// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
|
|
/// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
|
|
/// // ...
|
|
/// }
|
|
/// \endcode
|
|
///
|
|
/// \param Loc The location where the reduction was
|
|
/// encountered. Must be within the associated
|
|
/// directive and after the last local access to the
|
|
/// reduction variables.
|
|
/// \param AllocaIP An insertion point suitable for allocas usable
|
|
/// in reductions.
|
|
/// \param ReductionInfos A list of info on each reduction variable.
|
|
/// \param IsNoWait A flag set if the reduction is marked as nowait.
|
|
/// \param IsByRef A flag set if the reduction is using reference
|
|
/// or direct value.
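///
/// A minimal usage sketch (illustrative only; `OMPBuilder`, `Loc`, `AllocaIP`,
/// the original and privatized i32 variables `Orig` and `Priv`, and a
/// non-atomic reduction callback `SumGen` are assumed to be provided by the
/// caller):
/// \code
///   SmallVector<OpenMPIRBuilder::ReductionInfo> Infos;
///   Infos.emplace_back(Builder.getInt32Ty(), Orig, Priv,
///                      OpenMPIRBuilder::EvalKind::Scalar, SumGen,
///                      /*ReductionGenClang=*/nullptr,
///                      /*AtomicReductionGen=*/nullptr);
///   Builder.restoreIP(OMPBuilder.createReductions(
///       Loc, AllocaIP, Infos, /*IsByRef=*/{false}, /*IsNoWait=*/false));
/// \endcode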
InsertPointTy createReductions(const LocationDescription &Loc,
|
|
InsertPointTy AllocaIP,
|
|
ArrayRef<ReductionInfo> ReductionInfos,
|
|
ArrayRef<bool> IsByRef, bool IsNoWait = false);
|
|
|
|
///}
|
|
|
|
/// Return the insertion point used by the underlying IRBuilder.
|
|
InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
|
|
|
|
/// Update the internal location to \p Loc.
|
|
bool updateToLocation(const LocationDescription &Loc) {
|
|
Builder.restoreIP(Loc.IP);
|
|
Builder.SetCurrentDebugLocation(Loc.DL);
|
|
return Loc.IP.getBlock() != nullptr;
|
|
}
|
|
|
|
/// Return the function declaration for the runtime function with \p FnID.
|
|
FunctionCallee getOrCreateRuntimeFunction(Module &M,
|
|
omp::RuntimeFunction FnID);
|
|
|
|
Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
|
|
|
|
/// Return the (LLVM-IR) string describing the source location \p LocStr.
|
|
Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);
|
|
|
|
/// Return the (LLVM-IR) string describing the default source location.
|
|
Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
|
|
|
|
/// Return the (LLVM-IR) string describing the source location identified by
|
|
/// the arguments.
|
|
Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
|
|
unsigned Line, unsigned Column,
|
|
uint32_t &SrcLocStrSize);
|
|
|
|
/// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
|
|
/// fallback if \p DL does not specify the function name.
|
|
Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
|
|
Function *F = nullptr);
|
|
|
|
/// Return the (LLVM-IR) string describing the source location \p Loc.
|
|
Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
|
|
uint32_t &SrcLocStrSize);
|
|
|
|
/// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
|
|
/// TODO: Create an enum class for the Reserve2Flags.
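///
/// A minimal usage sketch (illustrative only; `OMPBuilder` and a
/// LocationDescription `Loc` are assumed to be provided by the caller):
/// \code
///   uint32_t SrcLocStrSize;
///   Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
///   Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
/// \endcode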
Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
|
|
omp::IdentFlag Flags = omp::IdentFlag(0),
|
|
unsigned Reserve2Flags = 0);
|
|
|
|
/// Create a hidden global flag \p Name in the module with initial value \p
|
|
/// Value.
|
|
GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
|
|
|
|
/// Generate control flow and cleanup for cancellation.
|
|
///
|
|
/// \param CancelFlag Flag indicating if the cancellation is performed.
|
|
/// \param CanceledDirective The kind of directive that is cancelled.
|
|
/// \param ExitCB Extra code to be generated in the exit block.
|
|
void emitCancelationCheckImpl(Value *CancelFlag,
|
|
omp::Directive CanceledDirective,
|
|
FinalizeCallbackTy ExitCB = {});
|
|
|
|
/// Generate a target region entry call.
|
|
///
|
|
/// \param Loc The location at which the request originated and is fulfilled.
|
|
/// \param AllocaIP The insertion point to be used for alloca instructions.
|
|
/// \param Return Return value of the created function returned by reference.
|
|
/// \param DeviceID Identifier for the device via the 'device' clause.
|
|
/// \param NumTeams Number of teams for the region via the 'num_teams' clause
|
|
/// or 0 if unspecified and -1 if there is no 'teams' clause.
|
|
/// \param NumThreads Number of threads via the 'thread_limit' clause.
|
|
/// \param HostPtr Pointer to the host-side pointer of the target kernel.
|
|
/// \param KernelArgs Array of arguments to the kernel.
|
|
InsertPointTy emitTargetKernel(const LocationDescription &Loc,
|
|
InsertPointTy AllocaIP, Value *&Return,
|
|
Value *Ident, Value *DeviceID, Value *NumTeams,
|
|
Value *NumThreads, Value *HostPtr,
|
|
ArrayRef<Value *> KernelArgs);
|
|
|
|
/// Generate a flush runtime call.
|
|
///
|
|
/// \param Loc The location at which the request originated and is fulfilled.
|
|
void emitFlush(const LocationDescription &Loc);
|
|
|
|
/// The finalization stack made up of finalize callbacks currently in-flight,
|
|
/// wrapped into FinalizationInfo objects that also reference the finalization
|
|
/// target block and the kind of cancellable directive.
|
|
SmallVector<FinalizationInfo, 8> FinalizationStack;
|
|
|
|
/// Return true if the last entry in the finalization stack is of kind \p DK
|
|
/// and cancellable.
|
|
bool isLastFinalizationInfoCancellable(omp::Directive DK) {
|
|
return !FinalizationStack.empty() &&
|
|
FinalizationStack.back().IsCancellable &&
|
|
FinalizationStack.back().DK == DK;
|
|
}
|
|
|
|
/// Generate a taskwait runtime call.
|
|
///
|
|
/// \param Loc The location at which the request originated and is fulfilled.
|
|
void emitTaskwaitImpl(const LocationDescription &Loc);
|
|
|
|
/// Generate a taskyield runtime call.
|
|
///
|
|
/// \param Loc The location at which the request originated and is fulfilled.
|
|
void emitTaskyieldImpl(const LocationDescription &Loc);
|
|
|
|
/// Return the current thread ID.
|
|
///
|
|
/// \param Ident The ident (ident_t*) describing the query origin.
|
|
Value *getOrCreateThreadID(Value *Ident);
|
|
|
|
/// The OpenMPIRBuilder Configuration
|
|
OpenMPIRBuilderConfig Config;
|
|
|
|
/// The underlying LLVM-IR module
|
|
Module &M;
|
|
|
|
/// The LLVM-IR Builder used to create IR.
|
|
IRBuilder<> Builder;
|
|
|
|
/// Map to remember source location strings
|
|
StringMap<Constant *> SrcLocStrMap;
|
|
|
|
/// Map to remember existing ident_t*.
|
|
DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
|
|
|
|
/// Info manager to keep track of target regions.
|
|
OffloadEntriesInfoManager OffloadInfoManager;
|
|
|
|
/// The target triple of the underlying module.
|
|
const Triple T;
|
|
|
|
/// Helper that contains information about regions we need to outline
|
|
/// during finalization.
|
|
struct OutlineInfo {
|
|
using PostOutlineCBTy = std::function<void(Function &)>;
|
|
PostOutlineCBTy PostOutlineCB;
|
|
BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
|
|
SmallVector<Value *, 2> ExcludeArgsFromAggregate;
|
|
|
|
/// Collect all blocks in between EntryBB and ExitBB in both the given
|
|
/// vector and set.
|
|
void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
|
|
SmallVectorImpl<BasicBlock *> &BlockVector);
|
|
|
|
/// Return the function that contains the region to be outlined.
|
|
Function *getFunction() const { return EntryBB->getParent(); }
|
|
};
|
|
|
|
/// Collection of regions that need to be outlined during finalization.
|
|
SmallVector<OutlineInfo, 16> OutlineInfos;
|
|
|
|
/// A collection of candidate target functions whose constant allocas will be
/// raised on a call to finalize, after all currently enqueued outline infos
/// have been processed.
|
|
SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
|
|
|
|
/// Collection of owned canonical loop objects that eventually need to be
|
|
/// free'd.
|
|
std::forward_list<CanonicalLoopInfo> LoopInfos;
|
|
|
|
/// Add a new region that will be outlined later.
|
|
void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
|
|
|
|
/// An ordered map of auto-generated variables to their unique names.
|
|
/// It stores variables with the following names: 1) ".gomp_critical_user_" +
|
|
/// <critical_section_name> + ".var" for "omp critical" directives; 2)
|
|
/// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
|
|
/// variables.
|
|
StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
|
|
|
|
/// Computes the size of type in bytes.
|
|
Value *getSizeInBytes(Value *BasePtr);
|
|
|
|
// Emit a branch from the current block to the Target block only if
// the current block does not already have a terminator.
|
|
void emitBranch(BasicBlock *Target);
|
|
|
|
// If BB has no use then delete it and return. Else place BB after the current
|
|
// block, if possible, or else at the end of the function. Also add a branch
|
|
// from current block to BB if current block does not have a terminator.
|
|
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);
|
|
|
|
/// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
|
|
/// Here is the logic:
|
|
/// if (Cond) {
|
|
/// ThenGen();
|
|
/// } else {
|
|
/// ElseGen();
|
|
/// }
|
|
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
|
|
BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});
|
|
|
|
/// Create the global variable holding the offload mappings information.
|
|
GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
|
|
std::string VarName);
|
|
|
|
/// Create the global variable holding the offload names information.
|
|
GlobalVariable *
|
|
createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
|
|
std::string VarName);
|
|
|
|
struct MapperAllocas {
|
|
AllocaInst *ArgsBase = nullptr;
|
|
AllocaInst *Args = nullptr;
|
|
AllocaInst *ArgSizes = nullptr;
|
|
};
|
|
|
|
/// Create the alloca instructions used in calls to mapper functions.
|
|
void createMapperAllocas(const LocationDescription &Loc,
|
|
InsertPointTy AllocaIP, unsigned NumOperands,
|
|
struct MapperAllocas &MapperAllocas);
|
|
|
|
/// Create the call for the target mapper function.
|
|
/// \param Loc The source location description.
|
|
/// \param MapperFunc Function to be called.
|
|
/// \param SrcLocInfo Source location information global.
|
|
/// \param MaptypesArg The argument types.
|
|
/// \param MapnamesArg The argument names.
|
|
/// \param MapperAllocas The AllocaInst used for the call.
|
|
/// \param DeviceID Device ID for the call.
|
|
/// \param NumOperands Number of operands in the call.
|
|
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
|
|
Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
|
|
struct MapperAllocas &MapperAllocas, int64_t DeviceID,
|
|
unsigned NumOperands);
|
|
|
|
/// Container for the arguments used to pass data to the runtime library.
|
|
struct TargetDataRTArgs {
|
|
/// The array of base pointer passed to the runtime library.
|
|
Value *BasePointersArray = nullptr;
|
|
/// The array of section pointers passed to the runtime library.
|
|
Value *PointersArray = nullptr;
|
|
/// The array of sizes passed to the runtime library.
|
|
Value *SizesArray = nullptr;
|
|
/// The array of map types passed to the runtime library for the beginning
|
|
/// of the region or for the entire region if there are no separate map
|
|
/// types for the region end.
|
|
Value *MapTypesArray = nullptr;
|
|
/// The array of map types passed to the runtime library for the end of the
|
|
/// region, or nullptr if there are no separate map types for the region
|
|
/// end.
|
|
Value *MapTypesArrayEnd = nullptr;
|
|
/// The array of user-defined mappers passed to the runtime library.
|
|
Value *MappersArray = nullptr;
|
|
/// The array of original declaration names of mapped pointers sent to the
|
|
/// runtime library for debugging
|
|
Value *MapNamesArray = nullptr;
|
|
|
|
explicit TargetDataRTArgs() {}
|
|
explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
|
|
Value *SizesArray, Value *MapTypesArray,
|
|
Value *MapTypesArrayEnd, Value *MappersArray,
|
|
Value *MapNamesArray)
|
|
: BasePointersArray(BasePointersArray), PointersArray(PointersArray),
|
|
SizesArray(SizesArray), MapTypesArray(MapTypesArray),
|
|
MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
|
|
MapNamesArray(MapNamesArray) {}
|
|
};
|
|
|
|
/// Data structure that contains the needed information to construct the
|
|
/// kernel args vector.
|
|
struct TargetKernelArgs {
|
|
/// Number of arguments passed to the runtime library.
|
|
unsigned NumTargetItems = 0;
|
|
/// Arguments passed to the runtime library
|
|
TargetDataRTArgs RTArgs;
|
|
/// The number of iterations
|
|
Value *NumIterations = nullptr;
|
|
/// The number of teams.
|
|
ArrayRef<Value *> NumTeams;
|
|
/// The number of threads.
|
|
ArrayRef<Value *> NumThreads;
|
|
/// The size of the dynamic shared memory.
|
|
Value *DynCGGroupMem = nullptr;
|
|
/// True if the kernel has 'no wait' clause.
|
|
bool HasNoWait = false;
|
|
|
|
// Constructors for TargetKernelArgs.
|
|
TargetKernelArgs() {}
|
|
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
|
|
Value *NumIterations, ArrayRef<Value *> NumTeams,
|
|
ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
|
|
bool HasNoWait)
|
|
: NumTargetItems(NumTargetItems), RTArgs(RTArgs),
|
|
NumIterations(NumIterations), NumTeams(NumTeams),
|
|
NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
|
|
HasNoWait(HasNoWait) {}
|
|
};
|
|
|
|
/// Create the kernel args vector used by emitTargetKernel. This function
|
|
/// creates various constant values that are used in the resulting args
|
|
/// vector.
|
|
static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
|
|
IRBuilderBase &Builder,
|
|
SmallVector<Value *> &ArgsVector);
|
|
|
|
/// Struct that keeps the information that should be kept throughout
|
|
/// a 'target data' region.
|
|
class TargetDataInfo {
|
|
/// Set to true if device pointer information has to be obtained.
|
|
bool RequiresDevicePointerInfo = false;
|
|
/// Set to true if Clang emits separate runtime calls for the beginning and
|
|
/// end of the region. These calls might have separate map type arrays.
|
|
bool SeparateBeginEndCalls = false;
|
|
|
|
public:
|
|
TargetDataRTArgs RTArgs;
|
|
|
|
SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
|
|
DevicePtrInfoMap;
|
|
|
|
/// Indicate whether any user-defined mapper exists.
|
|
bool HasMapper = false;
|
|
/// The total number of pointers passed to the runtime library.
|
|
unsigned NumberOfPtrs = 0u;
|
|
|
|
bool EmitDebug = false;
|
|
|
|
explicit TargetDataInfo() {}
|
|
explicit TargetDataInfo(bool RequiresDevicePointerInfo,
|
|
bool SeparateBeginEndCalls)
|
|
: RequiresDevicePointerInfo(RequiresDevicePointerInfo),
|
|
SeparateBeginEndCalls(SeparateBeginEndCalls) {}
|
|
/// Clear information about the data arrays.
|
|
void clearArrayInfo() {
|
|
RTArgs = TargetDataRTArgs();
|
|
HasMapper = false;
|
|
NumberOfPtrs = 0u;
|
|
}
|
|
/// Return true if the current target data information has valid arrays.
|
|
bool isValid() {
|
|
return RTArgs.BasePointersArray && RTArgs.PointersArray &&
|
|
RTArgs.SizesArray && RTArgs.MapTypesArray &&
|
|
(!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
|
|
}
|
|
bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
|
|
bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
|
|
};
|
|
|
|
enum class DeviceInfoTy { None, Pointer, Address };
|
|
using MapValuesArrayTy = SmallVector<Value *, 4>;
|
|
using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
|
|
using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
|
|
using MapNamesArrayTy = SmallVector<Constant *, 4>;
|
|
using MapDimArrayTy = SmallVector<uint64_t, 4>;
|
|
using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
|
|
|
|
/// This structure contains combined information generated for mappable
|
|
/// clauses, including base pointers, pointers, sizes, map types, user-defined
|
|
/// mappers, and non-contiguous information.
|
|
struct MapInfosTy {
|
|
struct StructNonContiguousInfo {
|
|
bool IsNonContiguous = false;
|
|
MapDimArrayTy Dims;
|
|
MapNonContiguousArrayTy Offsets;
|
|
MapNonContiguousArrayTy Counts;
|
|
MapNonContiguousArrayTy Strides;
|
|
};
|
|
MapValuesArrayTy BasePointers;
|
|
MapValuesArrayTy Pointers;
|
|
MapDeviceInfoArrayTy DevicePointers;
|
|
MapValuesArrayTy Sizes;
|
|
MapFlagsArrayTy Types;
|
|
MapNamesArrayTy Names;
|
|
StructNonContiguousInfo NonContigInfo;
|
|
|
|
/// Append arrays in \a CurInfo.
|
|
void append(MapInfosTy &CurInfo) {
|
|
BasePointers.append(CurInfo.BasePointers.begin(),
|
|
CurInfo.BasePointers.end());
|
|
Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
|
|
DevicePointers.append(CurInfo.DevicePointers.begin(),
|
|
CurInfo.DevicePointers.end());
|
|
Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
|
|
Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
|
|
Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
|
|
NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
|
|
CurInfo.NonContigInfo.Dims.end());
|
|
NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
|
|
CurInfo.NonContigInfo.Offsets.end());
|
|
NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
|
|
CurInfo.NonContigInfo.Counts.end());
|
|
NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
|
|
CurInfo.NonContigInfo.Strides.end());
|
|
}
|
|
};
|
|
|
|
/// Callback function type for functions emitting the host fallback code that
|
|
/// is executed when the kernel launch fails. It takes an insertion point as
|
|
/// parameter where the code should be emitted. It returns an insertion point
|
|
/// that points right after the emitted code.
|
|
using EmitFallbackCallbackTy = function_ref<InsertPointTy(InsertPointTy)>;
|
|
|
|
/// Generate a target region entry call and host fallback call.
|
|
///
|
|
/// \param Loc The location at which the request originated and is fulfilled.
|
|
/// \param OutlinedFn The outlined kernel function.
|
|
/// \param OutlinedFnID The outlined function ID.
|
|
/// \param EmitTargetCallFallbackCB Call back function to generate host
|
|
/// fallback code.
|
|
/// \param Args Data structure holding information about the kernel arguments.
|
|
/// \param DeviceID Identifier for the device via the 'device' clause.
|
|
/// \param RTLoc Source location identifier
|
|
/// \param AllocaIP The insertion point to be used for alloca instructions.
|
|
InsertPointTy emitKernelLaunch(
|
|
const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
|
|
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
|
|
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
|
|
|
|
/// Generate a target-task for the target construct
|
|
///
|
|
/// \param OutlinedFn The outlined device/target kernel function.
|
|
/// \param OutlinedFnID The outlined function ID.
|
|
/// \param EmitTargetCallFallbackCB Call back function to generate host
|
|
/// fallback code.
|
|
/// \param Args Data structure holding information about the kernel arguments.
|
|
/// \param DeviceID Identifier for the device via the 'device' clause.
|
|
/// \param RTLoc Source location identifier
|
|
/// \param AllocaIP The insertion point to be used for alloca instructions.
|
|
/// \param Dependencies Vector of DependData objects holding information of
|
|
/// dependencies as specified by the 'depend' clause.
|
|
/// \param HasNoWait True if the target construct had 'nowait' on it, false
|
|
/// otherwise
|
|
InsertPointTy emitTargetTask(
|
|
Function *OutlinedFn, Value *OutlinedFnID,
|
|
EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
|
|
Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
|
|
SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
|
|
|
|
/// Emit the arguments to be passed to the runtime library based on the
|
|
/// arrays of base pointers, pointers, sizes, map types, and mappers. If
|
|
/// ForEndCall, emit map types to be passed for the end of the region instead
|
|
/// of the beginning.
|
|
void emitOffloadingArraysArgument(IRBuilderBase &Builder,
|
|
OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
|
|
OpenMPIRBuilder::TargetDataInfo &Info,
|
|
bool ForEndCall = false);
|
|
|
|
/// Emit an array of struct descriptors to be assigned to the offload args.
|
|
void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
|
|
InsertPointTy CodeGenIP,
|
|
MapInfosTy &CombinedInfo,
|
|
TargetDataInfo &Info);
|
|
|
|
/// Emit the arrays used to pass the captures and map information to the
|
|
/// offloading runtime library. If there is no map or capture information,
|
|
/// return nullptr by reference. Accepts a reference to a MapInfosTy object
|
|
/// that contains information generated for mappable clauses,
|
|
/// including base pointers, pointers, sizes, map types, user-defined mappers.
|
|
void emitOffloadingArrays(
|
|
InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
|
|
TargetDataInfo &Info, bool IsNonContiguous = false,
|
|
function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
|
|
function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
|
|
|
|
/// Allocates memory for and populates the arrays required for offloading
|
|
/// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
|
|
/// emits their base addresses as arguments to be passed to the runtime
|
|
/// library. In essence, this function is a combination of
|
|
/// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
|
|
/// be preferred by clients of OpenMPIRBuilder.
|
|
void emitOffloadingArraysAndArgs(
|
|
InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
|
|
TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
|
|
bool IsNonContiguous = false, bool ForEndCall = false,
|
|
function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
|
|
function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
|
|
|
|
/// Creates offloading entry for the provided entry ID \a ID, address \a
|
|
/// Addr, size \a Size, and flags \a Flags.
|
|
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
|
|
int32_t Flags, GlobalValue::LinkageTypes,
|
|
StringRef Name = "");
|
|
|
|
/// The kind of errors that can occur when emitting the offload entries and
|
|
/// metadata.
|
|
enum EmitMetadataErrorKind {
|
|
EMIT_MD_TARGET_REGION_ERROR,
|
|
EMIT_MD_DECLARE_TARGET_ERROR,
|
|
EMIT_MD_GLOBAL_VAR_LINK_ERROR
|
|
};
|
|
|
|
/// Callback function type
|
|
using EmitMetadataErrorReportFunctionTy =
|
|
std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
|
|
|
|
// Emit the offloading entries and metadata so that the device codegen side
|
|
// can easily figure out what to emit. The produced metadata looks like
|
|
// this:
|
|
//
|
|
// !omp_offload.info = !{!1, ...}
|
|
//
|
|
// We only generate metadata for functions that contain target regions.
|
|
void createOffloadEntriesAndInfoMetadata(
|
|
EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
|
|
|
|
public:
|
|
/// Generator for __kmpc_copyprivate
|
|
///
|
|
/// \param Loc The source location description.
|
|
/// \param BufSize Number of elements in the buffer.
|
|
/// \param CpyBuf List of pointers to data to be copied.
|
|
/// \param CpyFn function to call for copying data.
|
|
/// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
|
|
///
|
|
/// \return The insertion position *after* the CopyPrivate call.
|
|
|
|
InsertPointTy createCopyPrivate(const LocationDescription &Loc,
|
|
llvm::Value *BufSize, llvm::Value *CpyBuf,
|
|
llvm::Value *CpyFn, llvm::Value *DidIt);
|
|
|
|
/// Generator for '#omp single'
|
|
///
|
|
/// \param Loc The source location description.
|
|
/// \param BodyGenCB Callback that will generate the region code.
|
|
/// \param FiniCB Callback to finalize variable copies.
|
|
/// \param IsNowait If false, a barrier is emitted.
|
|
/// \param CPVars copyprivate variables.
|
|
/// \param CPFuncs copy functions to use for each copyprivate variable.
|
|
///
|
|
/// \returns The insertion position *after* the single call.
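///
/// A minimal usage sketch (illustrative only; `OMPBuilder`, `Loc`, and the
/// body and finalization callbacks `BodyGenCB` and `FiniCB` are assumed to be
/// provided by the caller):
/// \code
///   Builder.restoreIP(OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB,
///                                             /*IsNowait=*/false));
/// \endcode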
InsertPointTy createSingle(const LocationDescription &Loc,
|
|
BodyGenCallbackTy BodyGenCB,
|
|
FinalizeCallbackTy FiniCB, bool IsNowait,
|
|
ArrayRef<llvm::Value *> CPVars = {},
|
|
ArrayRef<llvm::Function *> CPFuncs = {});
|
|
|
|
/// Generator for '#omp master'
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param BodyGenCB Callback that will generate the region code.
|
|
/// \param FiniCB Callback to finalize variable copies.
|
|
///
|
|
/// \returns The insertion position *after* the master.
|
|
InsertPointTy createMaster(const LocationDescription &Loc,
|
|
BodyGenCallbackTy BodyGenCB,
|
|
FinalizeCallbackTy FiniCB);
|
|
|
|
/// Generator for '#omp masked'
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param BodyGenCB Callback that will generate the region code.
|
|
/// \param FiniCB Callback to finalize variable copies.
|
|
///
|
|
/// \returns The insertion position *after* the masked.
|
|
InsertPointTy createMasked(const LocationDescription &Loc,
|
|
BodyGenCallbackTy BodyGenCB,
|
|
FinalizeCallbackTy FiniCB, Value *Filter);
|
|
|
|
/// Generator for '#omp critical'
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param BodyGenCB Callback that will generate the region body code.
|
|
/// \param FiniCB Callback to finalize variable copies.
|
|
/// \param CriticalName name of the lock used by the critical directive
|
|
/// \param HintInst Hint Instruction for hint clause associated with critical
|
|
///
|
|
/// \returns The insertion position *after* the critical.
|
|
InsertPointTy createCritical(const LocationDescription &Loc,
|
|
BodyGenCallbackTy BodyGenCB,
|
|
FinalizeCallbackTy FiniCB,
|
|
StringRef CriticalName, Value *HintInst);
|
|
|
|
/// Generator for '#omp ordered depend (source | sink)'
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param AllocaIP The insertion point to be used for alloca instructions.
|
|
/// \param NumLoops The number of loops in depend clause.
|
|
/// \param StoreValues The values that will be stored in the vector address.
/// \param Name The name of the alloca instruction.
|
|
/// \param IsDependSource If true, depend source; otherwise, depend sink.
|
|
///
|
|
/// \return The insertion position *after* the ordered.
|
|
InsertPointTy createOrderedDepend(const LocationDescription &Loc,
|
|
InsertPointTy AllocaIP, unsigned NumLoops,
|
|
ArrayRef<llvm::Value *> StoreValues,
|
|
const Twine &Name, bool IsDependSource);
|
|
|
|
/// Generator for '#omp ordered [threads | simd]'
|
|
///
|
|
/// \param Loc The insert and source location description.
|
|
/// \param BodyGenCB Callback that will generate the region code.
|
|
/// \param FiniCB Callback to finalize variable copies.
|
|
/// \param IsThreads If true, the 'threads' clause was specified (or no
///                  clause at all); otherwise, the 'simd' clause was
///                  specified.
|
|
///
|
|
/// \returns The insertion position *after* the ordered.
|
|
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
|
|
BodyGenCallbackTy BodyGenCB,
|
|
FinalizeCallbackTy FiniCB,
|
|
bool IsThreads);
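
/// Example: a sketch of an '#omp ordered threads' region (illustrative only;
/// `OMPBuilder`, `Builder` and `Loc` are assumed to be provided by the
/// surrounding code generator).
/// \code{.cpp}
///   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the ordered body here ...
///   };
///   auto FiniCB = [&](InsertPointTy IP) {};
///   Builder.restoreIP(OMPBuilder.createOrderedThreadsSimd(
///       Loc, BodyGenCB, FiniCB, /*IsThreads=*/true));
/// \endcode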

/// Generator for '#omp sections'
///
/// \param Loc The insert and source location description.
/// \param AllocaIP The insertion point to be used for alloca instructions.
/// \param SectionCBs Callbacks that will generate the body of each section.
/// \param PrivCB Callback to copy a given variable (think copy constructor).
/// \param FiniCB Callback to finalize variable copies.
/// \param IsCancellable Flag to indicate a cancellable parallel region.
/// \param IsNowait If true, the barrier that normally ensures all sections
/// have executed before moving forward is not generated.
/// \returns The insertion position *after* the sections.
InsertPointTy createSections(const LocationDescription &Loc,
InsertPointTy AllocaIP,
ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
PrivatizeCallbackTy PrivCB,
FinalizeCallbackTy FiniCB, bool IsCancellable,
bool IsNowait);
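
/// Example: a sketch with two sections (illustrative only; `OMPBuilder`,
/// `Builder`, `Loc` and `AllocaIP` are assumed to be available, and the
/// privatization callback simply reuses the original variable).
/// \code{.cpp}
///   SmallVector<StorableBodyGenCallbackTy, 2> SectionCBs;
///   for (int S = 0; S < 2; ++S)
///     SectionCBs.push_back([&, S](InsertPointTy, InsertPointTy CodeGenIP) {
///       Builder.restoreIP(CodeGenIP);
///       // ... emit the body of section S here ...
///     });
///   auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
///                    Value &Original, Value &Inner, Value *&ReplVal) {
///     ReplVal = &Inner; // No privatization in this sketch.
///     return CodeGenIP;
///   };
///   auto FiniCB = [&](InsertPointTy IP) {};
///   Builder.restoreIP(OMPBuilder.createSections(
///       Loc, AllocaIP, SectionCBs, PrivCB, FiniCB, /*IsCancellable=*/false,
///       /*IsNowait=*/false));
/// \endcode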

/// Generator for '#omp section'
///
/// \param Loc The insert and source location description.
/// \param BodyGenCB Callback that will generate the region body code.
/// \param FiniCB Callback to finalize variable copies.
/// \returns The insertion position *after* the section.
InsertPointTy createSection(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB);

/// Generator for `#omp teams`
///
/// \param Loc The location where the teams construct was encountered.
/// \param BodyGenCB Callback that will generate the region code.
/// \param NumTeamsLower Lower bound on the number of teams. If this is
/// nullptr, it is as if the lower bound is specified as equal to the upper
/// bound. If this is non-null, then the upper bound must also be non-null.
/// \param NumTeamsUpper Upper bound on the number of teams.
/// \param ThreadLimit Limit on the number of threads that may participate in
/// a contention group created by each team.
/// \param IfExpr The integer argument value of the if condition on the
/// teams clause.
InsertPointTy
createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);
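
/// Example: a sketch emitting a teams region with an upper bound of 8 teams
/// (illustrative only; `OMPBuilder`, `Builder` and `Loc` are assumed, and the
/// bound is a plain i32 constant created through the embedded IRBuilder).
/// \code{.cpp}
///   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the teams region body here ...
///   };
///   Builder.restoreIP(OMPBuilder.createTeams(
///       Loc, BodyGenCB, /*NumTeamsLower=*/nullptr,
///       /*NumTeamsUpper=*/Builder.getInt32(8)));
/// \endcode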

/// Generate a conditional branch and the relevant BasicBlocks through which
/// private threads copy the 'copyin' variables from the master copy to the
/// threadprivate copies.
///
/// \param IP insertion block for the copyin conditional
/// \param MasterAddr a pointer to the master variable
/// \param PrivateAddr a pointer to the threadprivate variable
/// \param IntPtrTy Pointer size type
/// \param BranchtoEnd Create a branch between the copyin.not.master blocks
/// and the copy.in.end block
///
/// \returns The insertion point where the copying operation is to be emitted.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr,
Value *PrivateAddr,
llvm::IntegerType *IntPtrTy,
bool BranchtoEnd = true);

/// Create a runtime call for kmpc_Alloc
///
/// \param Loc The insert and source location description.
/// \param Size Size of the allocated memory space
/// \param Allocator Allocator information instruction
/// \param Name Name of the call instruction for OMP_alloc
///
/// \returns CallInst to the OMP_Alloc call
CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
Value *Allocator, std::string Name = "");

/// Create a runtime call for kmpc_free
///
/// \param Loc The insert and source location description.
/// \param Addr Address of the memory space to be freed
/// \param Allocator Allocator information instruction
/// \param Name Name of the call instruction for OMP_Free
///
/// \returns CallInst to the OMP_Free call
CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
Value *Allocator, std::string Name = "");
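
/// Example: a sketch pairing createOMPAlloc and createOMPFree for a scratch
/// buffer (illustrative only; `OMPBuilder`, `Builder` and `Loc` are assumed,
/// and the null allocator argument is assumed to select the runtime's
/// default allocator).
/// \code{.cpp}
///   Value *Size = Builder.getInt64(1024);
///   Value *Allocator = Constant::getNullValue(Builder.getPtrTy());
///   CallInst *Buf = OMPBuilder.createOMPAlloc(Loc, Size, Allocator, "scratch");
///   // ... use Buf ...
///   OMPBuilder.createOMPFree(Loc, Buf, Allocator);
/// \endcode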

/// Create a runtime call for kmpc_threadprivate_cached
///
/// \param Loc The insert and source location description.
/// \param Pointer Pointer to the data to be cached
/// \param Size Size of the data to be cached
/// \param Name Name of the call instruction
///
/// \returns CallInst to the thread private cache call.
CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
llvm::Value *Pointer,
llvm::ConstantInt *Size,
const llvm::Twine &Name = Twine(""));

/// Create a runtime call for __tgt_interop_init
///
/// \param Loc The insert and source location description.
/// \param InteropVar The interop variable to be initialized
/// \param InteropType Type of the interop operation
/// \param Device Device to which offloading will occur
/// \param NumDependences Number of dependence variables
/// \param DependenceAddress Pointer to the dependence variables
/// \param HaveNowaitClause Whether the nowait clause is present
///
/// \returns CallInst to the __tgt_interop_init call
CallInst *createOMPInteropInit(const LocationDescription &Loc,
Value *InteropVar,
omp::OMPInteropType InteropType, Value *Device,
Value *NumDependences,
Value *DependenceAddress,
bool HaveNowaitClause);

/// Create a runtime call for __tgt_interop_destroy
///
/// \param Loc The insert and source location description.
/// \param InteropVar The interop variable to be destroyed
/// \param Device Device to which offloading will occur
/// \param NumDependences Number of dependence variables
/// \param DependenceAddress Pointer to the dependence variables
/// \param HaveNowaitClause Whether the nowait clause is present
///
/// \returns CallInst to the __tgt_interop_destroy call
CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
Value *InteropVar, Value *Device,
Value *NumDependences,
Value *DependenceAddress,
bool HaveNowaitClause);

/// Create a runtime call for __tgt_interop_use
///
/// \param Loc The insert and source location description.
/// \param InteropVar The interop variable to be used
/// \param Device Device to which offloading will occur
/// \param NumDependences Number of dependence variables
/// \param DependenceAddress Pointer to the dependence variables
/// \param HaveNowaitClause Whether the nowait clause is present
///
/// \returns CallInst to the __tgt_interop_use call
CallInst *createOMPInteropUse(const LocationDescription &Loc,
Value *InteropVar, Value *Device,
Value *NumDependences, Value *DependenceAddress,
bool HaveNowaitClause);

/// The `omp target` interface
///
/// For more information about the usage of this interface,
/// \see openmp/libomptarget/deviceRTLs/common/include/target.h
///
///{

/// Create a runtime call for kmpc_target_init
///
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param MinThreadsVal Minimal number of threads, or 0.
/// \param MaxThreadsVal Maximal number of threads, or 0.
/// \param MinTeamsVal Minimal number of teams, or 0.
/// \param MaxTeamsVal Maximal number of teams, or 0.
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
int32_t MinThreadsVal = 0,
int32_t MaxThreadsVal = 0,
int32_t MinTeamsVal = 0,
int32_t MaxTeamsVal = 0);

/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
/// \param TeamsReductionDataSize The maximal size of all the reduction data
/// for teams reduction.
/// \param TeamsReductionBufferLength The number of elements (each of up to
/// \p TeamsReductionDataSize size) in the teams reduction buffer.
void createTargetDeinit(const LocationDescription &Loc,
int32_t TeamsReductionDataSize = 0,
int32_t TeamsReductionBufferLength = 1024);
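
/// Example: a sketch of bracketing a device kernel body with target
/// init/deinit calls (illustrative only; `OMPBuilder`, `Builder` and `Loc`
/// are assumed, and the kernel is emitted in SPMD mode).
/// \code{.cpp}
///   Builder.restoreIP(OMPBuilder.createTargetInit(Loc, /*IsSPMD=*/true));
///   // ... emit the kernel body here ...
///   OMPBuilder.createTargetDeinit(Loc);
/// \endcode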

///}

/// Helpers to read/write kernel annotations from the IR.
///
///{

/// Read/write the bounds on threads for \p Kernel. Read will return 0 if none
/// is set.
static std::pair<int32_t, int32_t>
readThreadBoundsForKernel(const Triple &T, Function &Kernel);
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
int32_t LB, int32_t UB);

/// Read/write the bounds on teams for \p Kernel. Read will return 0 if none
/// is set.
static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
Function &Kernel);
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
int32_t UB);
///}

private:
// Sets the function attributes expected for the outlined function
void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);

// Creates the function ID/Address for the given outlined function.
// In the case of an embedded device function the address of the function is
// used; in the case of a non-offload function, a constant is created.
Constant *createOutlinedFunctionID(Function *OutlinedFn,
StringRef EntryFnIDName);

// Creates the region entry address for the outlined function
Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
StringRef EntryFnName);

public:
/// Functions used to generate a function with the given name.
using FunctionGenCallback = std::function<Function *(StringRef FunctionName)>;

/// Create a unique name for the entry function using the source location
/// information of the current target region. The name will be something like:
///
/// __omp_offloading_DD_FFFF_PP_lBB[_CC]
///
/// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
/// mangled name of the function that encloses the target region and BB is the
/// line number of the target region. CC is a count added when more than one
/// region is located at the same location.
///
/// If this target outlined function is not an offload entry, we don't need to
/// register it. This may happen if it is guarded by an if clause that is
/// false at compile time, or if no target architectures have been specified.
///
/// The created target region ID is used by the runtime library to identify
/// the current target region, so it only has to be unique and not
/// necessarily point to anything. It could be the pointer to the outlined
/// function that implements the target region, but we aren't using that so
/// that the compiler doesn't need to keep it, and could therefore inline
/// the host function if proven worthwhile during optimization. On the other
/// hand, if emitting code for the device, the ID has to be the function
/// address so that it can be retrieved from the offloading entry and launched
/// by the runtime library. We also mark the outlined function to have
/// external linkage in case we are emitting code for the device, because
/// these functions will be entry points to the device.
///
/// \param InfoManager The info manager keeping track of the offload entries
/// \param EntryInfo The entry information about the function
/// \param GenerateFunctionCallback The callback function to generate the code
/// \param OutlinedFn Pointer to the outlined function
/// \param EntryFnIDName Name of the ID to be created
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
FunctionGenCallback &GenerateFunctionCallback,
bool IsOffloadEntry, Function *&OutlinedFn,
Constant *&OutlinedFnID);
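
/// Example: a sketch of generating and registering a target region entry
/// (illustrative only; `OMPBuilder`, `EntryInfo` and `createKernel` are
/// assumptions standing in for the caller's entry info and code generator).
/// \code{.cpp}
///   OpenMPIRBuilder::FunctionGenCallback GenCB = [&](StringRef EntryFnName) {
///     return createKernel(EntryFnName); // Builds the outlined llvm::Function.
///   };
///   Function *OutlinedFn = nullptr;
///   Constant *OutlinedFnID = nullptr;
///   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenCB,
///                                       /*IsOffloadEntry=*/true, OutlinedFn,
///                                       OutlinedFnID);
/// \endcode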

/// Registers the given function and sets up the attributes of the function.
/// Returns the FunctionID.
///
/// \param InfoManager The info manager keeping track of the offload entries
/// \param EntryInfo The entry information about the function
/// \param OutlinedFunction Pointer to the outlined function
/// \param EntryFnName Name of the outlined function
/// \param EntryFnIDName Name of the ID to be created
Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
Function *OutlinedFunction,
StringRef EntryFnName,
StringRef EntryFnIDName);

/// Type of BodyGen to use for region codegen
///
/// Priv: If device pointer privatization is required, emit the body of the
/// region here. It will have to be duplicated: with and without
/// privatization.
/// DupNoPriv: If we need device pointer privatization, we need
/// to emit the body of the region with no privatization in the 'else' branch
/// of the conditional.
/// NoPriv: If we don't require privatization of device
/// pointers, we emit the body in between the runtime calls. This avoids
/// duplicating the body code.
enum BodyGenTy { Priv, DupNoPriv, NoPriv };

/// Callback type for creating the map infos for the kernel parameters.
/// \param CodeGenIP is the insertion point where code should be generated,
/// if any.
using GenMapInfoCallbackTy =
function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;

/// Generator for '#omp target data'
///
/// \param Loc The location where the target data construct was encountered.
/// \param AllocaIP The insertion point to be used for alloca instructions.
/// \param CodeGenIP The insertion point at which the target directive code
/// should be placed.
/// \param IsBegin If true, then emits the begin mapper call; otherwise emits
/// the end mapper call.
/// \param DeviceID Stores the DeviceID from the device clause.
/// \param IfCond Value which corresponds to the if clause condition.
/// \param Info Stores all information related to the Target Data directive.
/// \param GenMapInfoCB Callback that populates the MapInfos and returns them.
/// \param BodyGenCB Optional callback to generate the region code.
/// \param DeviceAddrCB Optional callback to generate code related to
/// use_device_ptr and use_device_addr.
/// \param CustomMapperCB Optional callback to generate code related to
/// custom mappers.
OpenMPIRBuilder::InsertPointTy createTargetData(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
omp::RuntimeFunction *MapperFunc = nullptr,
function_ref<InsertPointTy(InsertPointTy CodeGenIP,
BodyGenTy BodyGenType)>
BodyGenCB = nullptr,
function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
Value *SrcLocInfo = nullptr);
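
/// Example: a sketch of emitting a 'target data' region around a body
/// (illustrative only; `OMPBuilder`, `Builder`, `Loc`, `AllocaIP`, `Info` and
/// `MapInfos` are assumed to be prepared by the caller, and the device id
/// constant -1 is assumed to denote the default device).
/// \code{.cpp}
///   auto GenMapInfoCB = [&](InsertPointTy CodeGenIP) -> MapInfosTy & {
///     return MapInfos; // Map info collected from the map clauses.
///   };
///   auto BodyCB = [&](InsertPointTy CodeGenIP, BodyGenTy BodyGenType) {
///     if (BodyGenType == BodyGenTy::NoPriv) {
///       Builder.restoreIP(CodeGenIP);
///       // ... emit the body of the data region here ...
///     }
///     return Builder.saveIP();
///   };
///   Builder.restoreIP(OMPBuilder.createTargetData(
///       Loc, AllocaIP, Builder.saveIP(), /*DeviceID=*/Builder.getInt64(-1),
///       /*IfCond=*/nullptr, Info, GenMapInfoCB, /*MapperFunc=*/nullptr,
///       BodyCB));
/// \endcode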

using TargetBodyGenCallbackTy = function_ref<InsertPointTy(
InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointTy(
Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP)>;

/// Generator for '#omp target'
///
/// \param Loc The location where the target construct was encountered.
/// \param IsOffloadEntry whether it is an offload entry.
/// \param CodeGenIP The insertion point where the call to the outlined
/// function should be emitted.
/// \param EntryInfo The entry information about the function.
/// \param NumTeams Number of teams specified in the num_teams clause.
/// \param NumThreads Number of threads specified in the thread_limit clause.
/// \param Inputs The input values to the region that will be passed
/// as arguments to the outlined function.
/// \param BodyGenCB Callback that will generate the region code.
/// \param ArgAccessorFuncCB Callback that will generate accessor
/// instructions for the passed-in target arguments where necessary.
/// \param Dependencies A vector of DependData objects that carry
/// dependency information as passed in the depend clause.
InsertPointTy
createTarget(const LocationDescription &Loc, bool IsOffloadEntry,
OpenMPIRBuilder::InsertPointTy AllocaIP,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
TargetRegionEntryInfo &EntryInfo, ArrayRef<int32_t> NumTeams,
ArrayRef<int32_t> NumThreads, SmallVectorImpl<Value *> &Inputs,
GenMapInfoCallbackTy GenMapInfoCB,
TargetBodyGenCallbackTy BodyGenCB,
TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
SmallVector<DependData> Dependencies = {});
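
/// Example: a sketch of emitting a '#omp target' region (illustrative only;
/// `OMPBuilder`, `Builder`, `Loc`, `AllocaIP`, `EntryInfo`, `Inputs` and
/// `GenMapInfoCB` are assumed to be prepared by the caller, and the team and
/// thread counts are placeholder values).
/// \code{.cpp}
///   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the target region body here ...
///     return Builder.saveIP();
///   };
///   auto ArgAccessorCB = [&](Argument &Arg, Value *Input, Value *&RetVal,
///                            InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
///     RetVal = Input; // Use the kernel argument directly.
///     return CodeGenIP;
///   };
///   Builder.restoreIP(OMPBuilder.createTarget(
///       Loc, /*IsOffloadEntry=*/true, AllocaIP, Builder.saveIP(), EntryInfo,
///       /*NumTeams=*/{0}, /*NumThreads=*/{0}, Inputs, GenMapInfoCB,
///       BodyGenCB, ArgAccessorCB));
/// \endcode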

/// Returns __kmpc_for_static_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned. Will create a distribute call
/// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
bool IsGPUDistribute);

/// Returns __kmpc_dispatch_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);

/// Returns __kmpc_dispatch_next_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);

/// Returns __kmpc_dispatch_fini_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);

/// Returns __kmpc_dispatch_deinit runtime function.
FunctionCallee createDispatchDeinitFunction();

/// Declarations for LLVM-IR types (simple, array, function and structure) are
/// generated below. Their names are defined and used in OpenMPKinds.def. Here
/// we provide the declarations; the initializeTypes function will provide the
/// values.
///
///{
#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
ArrayType *VarName##Ty = nullptr; \
PointerType *VarName##PtrTy = nullptr;
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
FunctionType *VarName = nullptr; \
PointerType *VarName##Ptr = nullptr;
#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
StructType *VarName = nullptr; \
PointerType *VarName##Ptr = nullptr;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

///}

private:
/// Create all simple and struct types exposed by the runtime and remember
/// the llvm::PointerTypes of them for easy access later.
void initializeTypes(Module &M);

/// Common interface for generating entry calls for OMP Directives.
/// If the directive has a region/body, it will set the insertion
/// point to the body.
///
/// \param OMPD Directive to generate entry blocks for
/// \param EntryCall Call to the entry OMP Runtime Function
/// \param ExitBB Block where the region ends.
/// \param Conditional Indicates whether the entry call result will be used
/// to evaluate a conditional of whether a thread will execute
/// body code or not.
///
/// \return The insertion position in the exit block
InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
BasicBlock *ExitBB,
bool Conditional = false);

/// Common interface to finalize the region
///
/// \param OMPD Directive to generate exiting code for
/// \param FinIP Insertion point for emitting finalization code and exit call
/// \param ExitCall Call to the ending OMP Runtime Function
/// \param HasFinalize Indicates whether the directive will require
/// finalization and has a finalization callback in the stack that
/// should be called.
///
/// \return The insertion position in the exit block
InsertPointTy emitCommonDirectiveExit(omp::Directive OMPD,
InsertPointTy FinIP,
Instruction *ExitCall,
bool HasFinalize = true);

/// Common interface to generate OMP inlined regions
///
/// \param OMPD Directive to generate the inlined region for
/// \param EntryCall Call to the entry OMP Runtime Function
/// \param ExitCall Call to the ending OMP Runtime Function
/// \param BodyGenCB Body code generation callback.
/// \param FiniCB Finalization callback. Will be called when finalizing the
/// region.
/// \param Conditional Indicates whether the entry call result will be used
/// to evaluate a conditional of whether a thread will execute
/// body code or not.
/// \param HasFinalize Indicates whether the directive will require
/// finalization and has a finalization callback in the stack that
/// should be called.
/// \param IsCancellable If HasFinalize is set to true, indicates whether the
/// directive should be cancellable.
/// \return The insertion point after the region

InsertPointTy
EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB, bool Conditional = false,
bool HasFinalize = true, bool IsCancellable = false);

/// Get the platform-specific name separator.
/// \param Parts different parts of the final name that need separation
/// \param FirstSeparator First separator used between the initial two
/// parts of the name.
/// \param Separator Separator used between the remaining consecutive
/// parts of the name.
static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
StringRef FirstSeparator,
StringRef Separator);

/// Returns the corresponding lock object for the specified critical region
/// name. If the lock object does not exist it is created, otherwise the
/// reference to the existing copy is returned.
/// \param CriticalName Name of the critical region.
///
Value *getOMPCriticalRegionLock(StringRef CriticalName);

/// Callback type for Atomic Expression update
/// ex:
/// \code{.cpp}
/// unsigned x = 0;
/// #pragma omp atomic update
/// x = Expr(x_old); // Expr() is any legal operation
/// \endcode
///
/// \param XOld The value of the atomic memory address to use for the update.
/// \param IRB Reference to the IRBuilder to use.
///
/// \returns Value to update X to.
using AtomicUpdateCallbackTy =
const function_ref<Value *(Value *XOld, IRBuilder<> &IRB)>;

private:
enum AtomicKind { Read, Write, Update, Capture, Compare };

/// Determine whether to emit flush or not
///
/// \param Loc The insert and source location description.
/// \param AO The required atomic ordering
/// \param AK The OpenMP atomic operation kind used.
///
/// \returns whether a flush was emitted or not
bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
AtomicOrdering AO, AtomicKind AK);

/// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
/// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
/// Only Scalar data types.
///
/// \param AllocaIP The insertion point to be used for alloca
/// instructions.
/// \param X The target atomic pointer to be updated
/// \param XElemTy The element type of the atomic pointer.
/// \param Expr The value to update X with.
/// \param AO Atomic ordering of the generated atomic
/// instructions.
/// \param RMWOp The binary operation used for the update. If the
/// operation is not supported by atomicrmw, or belongs
/// to {FADD, FSUB, BAD_BINOP}, then a `cmpxchg`-based
/// atomic will be generated.
/// \param UpdateOp Code generator for complex expressions that cannot be
/// expressed through an atomicrmw instruction.
/// \param VolatileX True if \a X is volatile.
/// \param IsXBinopExpr True if \a X is the left operand of the binary
/// operation on the right-hand side of the update
/// expression, false otherwise (e.g. true for X = X BinOp Expr).
///
/// \returns A pair of the old value of X before the update, and the value
/// used for the update.
std::pair<Value *, Value *>
emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
bool IsXBinopExpr);

/// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2.
///
/// \return The instruction
Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
AtomicRMWInst::BinOp RMWOp);

public:
/// A struct to pack relevant information while generating atomic ops.
struct AtomicOpValue {
Value *Var = nullptr;
Type *ElemTy = nullptr;
bool IsSigned = false;
bool IsVolatile = false;
};

/// Emit atomic read for: V = X --- Only Scalar data types.
///
/// \param Loc The insert and source location description.
/// \param X The target pointer to be atomically read
/// \param V Memory address where to store the atomically read
/// value
/// \param AO Atomic ordering of the generated atomic
/// instructions.
///
/// \return Insertion point after the generated atomic read IR.
InsertPointTy createAtomicRead(const LocationDescription &Loc,
AtomicOpValue &X, AtomicOpValue &V,
AtomicOrdering AO);

/// Emit atomic write for: X = Expr --- Only Scalar data types.
///
/// \param Loc The insert and source location description.
/// \param X The target pointer to be atomically written to
/// \param Expr The value to store.
/// \param AO Atomic ordering of the generated atomic
/// instructions.
///
/// \return Insertion point after the generated atomic write IR.
InsertPointTy createAtomicWrite(const LocationDescription &Loc,
AtomicOpValue &X, Value *Expr,
AtomicOrdering AO);
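
/// Example: a sketch of an atomic read followed by an atomic write
/// (illustrative only; `OMPBuilder`, `Builder`, `Loc`, and the pointers
/// `XAddr`/`VAddr` are assumed to be available).
/// \code{.cpp}
///   OpenMPIRBuilder::AtomicOpValue X{XAddr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   OpenMPIRBuilder::AtomicOpValue V{VAddr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   Builder.restoreIP(
///       OMPBuilder.createAtomicRead(Loc, X, V, AtomicOrdering::Monotonic));
///   Builder.restoreIP(OMPBuilder.createAtomicWrite(
///       Loc, X, Builder.getInt32(42), AtomicOrdering::Monotonic));
/// \endcode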

/// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
/// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
/// Only Scalar data types.
///
/// \param Loc The insert and source location description.
/// \param AllocaIP The insertion point to be used for alloca instructions.
/// \param X The target atomic pointer to be updated
/// \param Expr The value to update X with.
/// \param AO Atomic ordering of the generated atomic instructions.
/// \param RMWOp The binary operation used for the update. If the operation
/// is not supported by atomicrmw, or belongs to
/// {FADD, FSUB, BAD_BINOP}, then a `cmpxchg`-based
/// atomic will be generated.
/// \param UpdateOp Code generator for complex expressions that cannot be
/// expressed through an atomicrmw instruction.
/// \param IsXBinopExpr True if \a X is the left operand of the binary
/// operation on the right-hand side of the update
/// expression, false otherwise (e.g. true for X = X BinOp Expr).
///
/// \return Insertion point after the generated atomic update IR.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc,
InsertPointTy AllocaIP, AtomicOpValue &X,
Value *Expr, AtomicOrdering AO,
AtomicRMWInst::BinOp RMWOp,
AtomicUpdateCallbackTy &UpdateOp,
bool IsXBinopExpr);
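
/// Example: a sketch of 'x += 1' as an atomic update (illustrative only;
/// `OMPBuilder`, `Builder`, `Loc`, `AllocaIP` and `XAddr` are assumed; the
/// callback is only consulted when the operation cannot use atomicrmw).
/// \code{.cpp}
///   OpenMPIRBuilder::AtomicOpValue X{XAddr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   auto UpdateOp = [&](Value *XOld, IRBuilder<> &IRB) -> Value * {
///     return IRB.CreateAdd(XOld, IRB.getInt32(1));
///   };
///   Builder.restoreIP(OMPBuilder.createAtomicUpdate(
///       Loc, AllocaIP, X, Builder.getInt32(1), AtomicOrdering::Monotonic,
///       AtomicRMWInst::Add, UpdateOp, /*IsXBinopExpr=*/true));
/// \endcode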

/// Emit atomic capture for constructs: --- Only Scalar data types
/// V = X; X = X BinOp Expr,
/// X = X BinOp Expr; V = X,
/// V = X; X = Expr BinOp X,
/// X = Expr BinOp X; V = X,
/// V = X; X = UpdateOp(X),
/// X = UpdateOp(X); V = X,
///
/// \param Loc The insert and source location description.
/// \param AllocaIP The insertion point to be used for alloca instructions.
/// \param X The target atomic pointer to be updated
/// \param V Memory address where to store the captured value
/// \param Expr The value to update X with.
/// \param AO Atomic ordering of the generated atomic instructions
/// \param RMWOp The binary operation used for the update. If the
/// operation is not supported by atomicrmw, or belongs to
/// {FADD, FSUB, BAD_BINOP}, then a `cmpxchg`-based
/// atomic will be generated.
/// \param UpdateOp Code generator for complex expressions that cannot be
/// expressed through an atomicrmw instruction.
/// \param UpdateExpr True if X is an in-place update of the form
/// X = X BinOp Expr or X = Expr BinOp X
/// \param IsXBinopExpr True if X is the left operand of the binary
/// operation on the right-hand side of the update
/// expression, false otherwise (e.g. true for X = X BinOp Expr).
/// \param IsPostfixUpdate True if the original value of 'x' must be stored in
/// 'v', not an updated one.
///
/// \return Insertion point after the generated atomic capture IR.
InsertPointTy
createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP,
AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
bool IsPostfixUpdate, bool IsXBinopExpr);
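
/// Example: a sketch of a 'v = x++' style capture, i.e. 'v = x; x = x + 1;'
/// (illustrative only; `OMPBuilder`, `Builder`, `Loc`, `AllocaIP`, `XAddr`
/// and `VAddr` are assumed to be available).
/// \code{.cpp}
///   OpenMPIRBuilder::AtomicOpValue X{XAddr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   OpenMPIRBuilder::AtomicOpValue V{VAddr, Builder.getInt32Ty(),
///                                    /*IsSigned=*/true, /*IsVolatile=*/false};
///   auto UpdateOp = [&](Value *XOld, IRBuilder<> &IRB) -> Value * {
///     return IRB.CreateAdd(XOld, IRB.getInt32(1));
///   };
///   Builder.restoreIP(OMPBuilder.createAtomicCapture(
///       Loc, AllocaIP, X, V, Builder.getInt32(1), AtomicOrdering::Monotonic,
///       AtomicRMWInst::Add, UpdateOp, /*UpdateExpr=*/true,
///       /*IsPostfixUpdate=*/true, /*IsXBinopExpr=*/true));
/// \endcode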

/// Emit atomic compare for constructs: --- Only scalar data types
/// cond-expr-stmt:
/// x = x ordop expr ? expr : x;
/// x = expr ordop x ? expr : x;
/// x = x == e ? d : x;
/// x = e == x ? d : x; (this one is not in the spec)
/// cond-update-stmt:
/// if (x ordop expr) { x = expr; }
/// if (expr ordop x) { x = expr; }
/// if (x == e) { x = d; }
/// if (e == x) { x = d; } (this one is not in the spec)
/// conditional-update-capture-atomic:
/// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
/// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
/// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
/// IsFailOnly=true)
/// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
/// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
/// IsFailOnly=true)
///
/// \param Loc The insert and source location description.
/// \param X The target atomic pointer to be updated.
/// \param V Memory address where to store the captured value (for
/// compare capture only).
/// \param R Memory address where to store the comparison result
/// (for compare capture with '==' only).
/// \param E The expected value ('e') for forms that use an
/// equality comparison or an expression ('expr') for
/// forms that use 'ordop' (logically an atomic maximum or
/// minimum).
/// \param D The desired value for forms that use an equality
/// comparison. For forms that use 'ordop', it should be
/// \p nullptr.
/// \param AO Atomic ordering of the generated atomic instructions.
/// \param Op Atomic compare operation. It can only be ==, <, or >.
/// \param IsXBinopExpr True if the conditional statement is in the form where
/// x is on the LHS. It only matters for < or >.
/// \param IsPostfixUpdate True if the original value of 'x' must be stored in
/// 'v', not an updated one (for compare capture
/// only).
/// \param IsFailOnly True if the original value of 'x' is stored to 'v'
/// only when the comparison fails. This is only valid for
/// the case where the comparison is '=='.
///
/// \return Insertion point after the generated atomic compare IR.
InsertPointTy
createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
InsertPointTy createAtomicCompare(const LocationDescription &Loc,
AtomicOpValue &X, AtomicOpValue &V,
AtomicOpValue &R, Value *E, Value *D,
AtomicOrdering AO,
omp::OMPAtomicCompareOp Op,
bool IsXBinopExpr, bool IsPostfixUpdate,
bool IsFailOnly, AtomicOrdering Failure);

/// Create the control flow structure of a canonical OpenMP loop.
///
/// The emitted loop will be disconnected, i.e. no edge to the loop's
/// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
/// IRBuilder location is not preserved.
///
/// \param DL DebugLoc used for the instructions in the skeleton.
/// \param TripCount Value to be used for the trip count.
/// \param F Function in which to insert the BasicBlocks.
/// \param PreInsertBefore Where to insert BBs that execute before the body,
/// typically the body itself.
/// \param PostInsertBefore Where to insert BBs that execute after the body.
/// \param Name Base name used to derive BB
/// and instruction names.
///
/// \returns The CanonicalLoopInfo that represents the emitted loop.
CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
Function *F,
BasicBlock *PreInsertBefore,
BasicBlock *PostInsertBefore,
const Twine &Name = {});
/// OMP Offload Info Metadata name string
const std::string ompOffloadInfoName = "omp_offload.info";

/// Loads all the offload entries information from the host IR
/// metadata. This function is only meant to be used with device code
/// generation.
///
/// \param M Module to load the metadata info from. The module passed may be
/// loaded from a bitcode file, i.e., different from the OpenMPIRBuilder::M
/// module.
void loadOffloadInfoMetadata(Module &M);

/// Loads all the offload entries information from the host IR
/// metadata read from the file passed in as the HostFilePath argument. This
/// function is only meant to be used with device code generation.
///
/// \param HostFilePath The path to the host IR file,
/// used to load in offload metadata for the device, allowing host and device
/// to maintain the same metadata mapping.
void loadOffloadInfoMetadata(StringRef HostFilePath);

/// Gets (if a variable with the given name already exists) or creates an
/// internal global variable with the specified Name. The created variable has
/// CommonLinkage by default and is initialized to a null value.
/// \param Ty Type of the global variable. If it already exists, the type
/// must be the same.
/// \param Name Name of the variable.
GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
unsigned AddressSpace = 0);
};

/// Class to represent the control flow structure of an OpenMP canonical loop.
///
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
/// executed on only one thread. The constraints of a canonical loop in brief
/// are:
///
/// * The number of loop iterations must have been computed before entering the
/// loop.
///
/// * Has an (unsigned) logical induction variable that starts at zero and
/// increments by one.
///
/// * The loop's CFG itself has no side-effects. The OpenMP specification
/// itself allows side-effects, but the order in which they happen, including
/// how often or whether at all, is unspecified. We expect that the frontend
/// will emit those side-effect instructions somewhere (e.g. before the loop)
/// such that the CanonicalLoopInfo itself can be side-effect free.
///
/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satisfies these constraints. It does NOT
/// represent arbitrary SESE regions that happen to contain a loop. Do not use
/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
///     Preheader
///       |
///  /-> Header
///  |     |
///  |    Cond---\
///  |     |     |
///  |    Body   |
///  |   | | |   |
///  |  <...>    |
///  |   | | |   |
///   \--Latch   |
///              |
///             Exit
///              |
///            After
///
/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
/// including) and end at AfterIP (at the After's first instruction, excluding).
/// That is, instructions in the Preheader and After blocks (except the
/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
/// side-effects. Typically, the Preheader is used to compute the loop's trip
/// count. The instructions from BodyIP (at the Body block's first instruction,
/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
/// control and thus can have side-effects. The body block is the single entry
/// point into the loop body, which may contain arbitrary control flow as long
/// as all control paths eventually branch to the Latch block.
///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier by not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to the Latch having no PHINode and the Header's only PHINode
/// being for the induction variable.
///
/// All code in Header, Cond, Latch and Exit (plus the terminator of the
/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is checked
/// by assertOK(). It is expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
/// anymore as its underlying control flow may not exist anymore.
/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
/// may also return a new CanonicalLoopInfo that can be passed to other
/// loop-associated construct implementing methods. These loop-transforming
/// methods may either create a new CanonicalLoopInfo, usually using
/// createLoopSkeleton, and invalidate the input CanonicalLoopInfo, or reuse and
/// modify one of the input CanonicalLoopInfos and return it as representing the
/// modified loop. What is done is an implementation detail of the
/// transformation-implementing method, and callers should always assume that the
/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
/// The returned CanonicalLoopInfo has the same structure and guarantees as the
/// one created by createCanonicalLoop, such that transforming methods do not
/// have to special-case where the CanonicalLoopInfo originated from.
///
/// Generally, methods consuming CanonicalLoopInfo do not need an
/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
/// any InsertPoint in the Preheader, After or Block can still be used after
/// calling such a method.
///
/// TODO: Provide mechanisms for exception handling and cancellation points.
///
/// Defined outside OpenMPIRBuilder because nested classes cannot be
/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
friend class OpenMPIRBuilder;

private:
BasicBlock *Header = nullptr;
BasicBlock *Cond = nullptr;
BasicBlock *Latch = nullptr;
BasicBlock *Exit = nullptr;

/// Add the control blocks of this loop to \p BBs.
///
/// This does not include any block from the body, including the one returned
/// by getBody().
///
/// FIXME: This currently includes the Preheader and After blocks even though
/// their content is (mostly) not under CanonicalLoopInfo's control.
/// Re-evaluate whether this makes sense.
void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

/// Sets the number of loop iterations to the given value. This value must be
/// valid in the condition block (i.e., defined in the preheader) and is
/// interpreted as an unsigned integer.
void setTripCount(Value *TripCount);

/// Replace all uses of the canonical induction variable in the loop body with
/// a new one.
///
/// The intended use case is to update the induction variable for an updated
/// iteration space such that it can stay normalized in the 0...tripcount-1
/// range.
///
/// The \p Updater is called with the (presumably updated) current normalized
/// induction variable and is expected to return the value that uses of the
/// pre-updated induction values should use instead, typically dependent on
/// the new induction variable. This is a lambda (instead of e.g. just passing
/// the new value) to be able to distinguish the uses of the pre-updated
/// induction variable and uses of the induction variable to compute the
/// updated induction variable value.
void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);

public:
/// Returns whether this object currently represents the IR of a loop. If
/// returning false, it may have been consumed by a loop transformation or not
/// been initialized. Do not use it in this case.
bool isValid() const { return Header; }

/// The preheader ensures that there is only a single edge entering the loop.
/// Code that must be executed before any loop iteration can be emitted here,
/// such as computing the loop trip count and begin lifetime markers. Code in
/// the preheader is not considered part of the canonical loop.
BasicBlock *getPreheader() const;

/// The header is the entry for each iteration. In the canonical control flow,
/// it only contains the PHINode for the induction variable.
BasicBlock *getHeader() const {
assert(isValid() && "Requires a valid canonical loop");
return Header;
}

/// The condition block computes whether there is another loop iteration. If
/// yes, branches to the body; otherwise to the exit block.
BasicBlock *getCond() const {
assert(isValid() && "Requires a valid canonical loop");
return Cond;
}

/// The body block is the single entry for a loop iteration and not controlled
/// by CanonicalLoopInfo. It can contain arbitrary control flow but must
/// eventually branch to the \p Latch block.
BasicBlock *getBody() const {
assert(isValid() && "Requires a valid canonical loop");
return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
}

/// Reaching the latch indicates the end of the loop body code. In the
/// canonical control flow, it only contains the increment of the induction
/// variable.
BasicBlock *getLatch() const {
assert(isValid() && "Requires a valid canonical loop");
return Latch;
}

/// Reaching the exit indicates no more iterations are being executed.
BasicBlock *getExit() const {
assert(isValid() && "Requires a valid canonical loop");
return Exit;
}

/// The after block is intended for clean-up code such as lifetime end
/// markers. It is separate from the exit block to ensure that, analogous to
/// the preheader, it has just a single entry edge and is free from PHI nodes
/// should there be multiple loop exits (such as from break
/// statements/cancellations).
BasicBlock *getAfter() const {
assert(isValid() && "Requires a valid canonical loop");
return Exit->getSingleSuccessor();
}

/// Returns the llvm::Value containing the number of loop iterations. It must
/// be valid in the preheader and always interpreted as an unsigned integer of
/// any bit-width.
Value *getTripCount() const {
assert(isValid() && "Requires a valid canonical loop");
Instruction *CmpI = &Cond->front();
assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
return CmpI->getOperand(1);
}

/// Returns the instruction representing the current logical induction
/// variable. Always unsigned, always starting at 0 with an increment of one.
Instruction *getIndVar() const {
assert(isValid() && "Requires a valid canonical loop");
Instruction *IndVarPHI = &Header->front();
assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
return IndVarPHI;
}

/// Return the type of the induction variable (and the trip count).
Type *getIndVarType() const {
assert(isValid() && "Requires a valid canonical loop");
return getIndVar()->getType();
}

/// Return the insertion point for user code before the loop.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
assert(isValid() && "Requires a valid canonical loop");
BasicBlock *Preheader = getPreheader();
return {Preheader, std::prev(Preheader->end())};
};

/// Return the insertion point for user code in the body.
OpenMPIRBuilder::InsertPointTy getBodyIP() const {
assert(isValid() && "Requires a valid canonical loop");
BasicBlock *Body = getBody();
return {Body, Body->begin()};
};

/// Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const {
assert(isValid() && "Requires a valid canonical loop");
BasicBlock *After = getAfter();
return {After, After->begin()};
};
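
/// Example: a sketch of filling in a canonical loop's body and continuing
/// after it (illustrative only; `OMPBuilder`, `Builder` and `CLI`, a
/// CanonicalLoopInfo obtained from the builder, are assumed).
/// \code{.cpp}
///   Builder.restoreIP(CLI->getBodyIP());
///   Value *IV = CLI->getIndVar();
///   // ... emit the loop body here, indexed by IV ...
///   Builder.restoreIP(CLI->getAfterIP());
///   // ... emit code that runs once the loop is done ...
/// \endcode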

Function *getFunction() const {
assert(isValid() && "Requires a valid canonical loop");
return Header->getParent();
}

/// Consistency self-check.
void assertOK() const;

/// Invalidate this loop. That is, the underlying IR does not fulfill the
/// requirements of an OpenMP canonical loop anymore.
void invalidate();
};

} // end namespace llvm

#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H