From 3a490934bff3b526797abf8f01c9ffc6c660547e Mon Sep 17 00:00:00 2001 From: Daniel Hernandez-Juarez Date: Tue, 17 Sep 2024 16:25:19 +0000 Subject: [PATCH] Refactor Reuse LDS to be able to keep the same heuristic and also revert the enableApplicability change --- .../mlir/Dialect/Rock/utility/memoryUtils.h | 49 +++ mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp | 26 +- .../Dialect/Rock/Transforms/OutputSwizzle.cpp | 44 ++- mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp | 300 ++---------------- mlir/lib/Dialect/Rock/utility/CMakeLists.txt | 1 + mlir/lib/Dialect/Rock/utility/memoryUtils.cpp | 283 +++++++++++++++++ 6 files changed, 398 insertions(+), 305 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h create mode 100644 mlir/lib/Dialect/Rock/utility/memoryUtils.cpp diff --git a/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h b/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h new file mode 100644 index 000000000000..deafabe3f50d --- /dev/null +++ b/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h @@ -0,0 +1,49 @@ +#ifndef MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H +#define MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H + +#include "mlir/Dialect/Rock/IR/Rock.h" +#include "llvm/ADT/MapVector.h" + +namespace mlir { +namespace rock { + +struct LDSInfo { + llvm::SmallDenseMap> + interferenceGraph; + SmallVector allocs; + SmallVector deallocs; + llvm::SmallDenseMap> deallocBefore; +}; + +/// Utility function to get workgroup memory size +std::optional getWorkgroupMemorySize(MemRefType type); + +/// Utility function to check if there is enough LDS on the target architecture +LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes); + +/// This is a greedy graph coloring algorithm. +/// There are some changes to make it work for LDS, the main one: +/// each alloc can be assigned more than one color, this is because +/// in graph coloring all vertices are assumed to be the same size +/// (for example, register allocation). 
+/// Example: A=GpuAllocOp(1kB), B=GpuAllocOp(1kB), C=GpuAllocOp(2kB) +/// A <--> B C (A and B have an edge, C disjoint) +/// In this case, we can assign colors: A -> {0}, B -> {1}, and C -> {0, 1}. +/// Colors 0 and 1 are 1kB each. +/// Note: If an alloc has more than one color assigned, they have to be +/// consecutive. +std::tuple, + SmallVector>> +graphColoring(LDSInfo &ldsInfo); + +/// Utility function to create an interference graph of GPUAllocs and +/// GPUDeallocs +FailureOr createInterferenceGraph(func::FuncOp &func); + +/// Utility function to compute allocated LDS after LDS reuse pass. +FailureOr getAllocatedLDSAfterReuse(func::FuncOp &func); + +} // namespace rock +} // namespace mlir + +#endif // MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp index c66176f60e8f..db4c3314e8ce 100644 --- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp +++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp @@ -156,20 +156,20 @@ void rock::buildKernelPipeline(OpPassManager &pm, funcPm.addPass(rock::createRockBlockwiseGemmToThreadwisePass()); funcPm.addPass(rock::createRockOutputSwizzlePass()); - if (options.enableFusion) { - // align linalg tiling - /* rocmlir-opt --rock-linalg-align --rock-pipeline --canonicalize - * --convert-linalg-to-affine-loops --rock-vectorize-fusions - */ - funcPm.addPass(rock::createRockLinalgAlignPass()); - funcPm.addPass(rock::createRockPipelinePass()); - funcPm.addPass(createCanonicalizerPass()); - funcPm.addPass(createConvertLinalgToAffineLoopsPass()); - funcPm.addPass(rock::createRockVectorizeFusionsPass()); - } - funcPm.addPass(rock::createRockReuseLDSPass()); - if (!options.enableApplicability) { + if (options.enableFusion) { + // align linalg tiling + /* rocmlir-opt --rock-linalg-align --rock-pipeline --canonicalize + * --convert-linalg-to-affine-loops --rock-vectorize-fusions + */ + funcPm.addPass(rock::createRockLinalgAlignPass()); + 
funcPm.addPass(rock::createRockPipelinePass()); + funcPm.addPass(createCanonicalizerPass()); + funcPm.addPass(createConvertLinalgToAffineLoopsPass()); + funcPm.addPass(rock::createRockVectorizeFusionsPass()); + } + funcPm.addPass(rock::createRockReuseLDSPass()); + // rock lowering for reductions /* rocmlir-opt --rock-lower-reduce */ diff --git a/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp b/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp index 3af02e1d3450..faa16893b8ef 100644 --- a/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp +++ b/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp @@ -22,7 +22,6 @@ #include "mlir/Dialect/Rock/IR/Rock.h" #include "mlir/Dialect/Rock/IR/TransformMapBuilder.h" #include "mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h" -#include "mlir/Dialect/Rock/utility/AmdArchDb.h" #include "mlir/Dialect/Rock/utility/loweringUtils.h" #include "mlir/Dialect/Rock/utility/math.h" #include "mlir/Dialect/Rock/utility/transformMapUtils.h" @@ -32,6 +31,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Rock/Passes.h" #include "mlir/Dialect/Rock/utility/builderUtils.h" +#include "mlir/Dialect/Rock/utility/memoryUtils.h" #include "mlir/Dialect/Rock/utility/transformMapUtils.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/Utils/IndexingUtils.h" @@ -80,17 +80,6 @@ static bool hasGlobalMemoryAddressSpace(MemRefType type) { !hasPrivateMemoryAddressSpace(type); } -static LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) { - // Check for arch limitations exceeded - FailureOr maybeArch = getArch(op); - if (succeeded(maybeArch)) { - StringAttr arch = maybeArch.value(); - const int64_t ldsSize = rock::lookupArchInfo(arch).maxSharedMemPerWG; - return success(ldsBytes <= ldsSize); - } - return success(); -} - static std::optional> getIdToLDS(ThreadwiseWriteAllOp &op, OpBuilder &b) { ArrayAttr srcTransform = op.getExtraViewsAttr(); @@ -195,8 +184,9 @@ struct ThreadwiseWriteAllRewritePattern 
return success(); } size_t extraIdxCount = op.getExtraIndices().size(); - VectorizationResult vectorRes = - getMaxVectorization(destView, extraIdxCount, /*inputDimLen=*/std::nullopt, destView.getDefiningOp()); + VectorizationResult vectorRes = getMaxVectorization( + destView, extraIdxCount, /*inputDimLen=*/std::nullopt, + destView.getDefiningOp()); int64_t originalVectorLen = vectorRes.max; if (vectorLen <= originalVectorLen) { @@ -398,8 +388,24 @@ void RockOutputSwizzlePass::runOnOperation() { if (!func->hasAttr("kernel")) return; + // Get allocated LDS after "reuse LDS" pass + FailureOr maybeAllocatedLDS = getAllocatedLDSAfterReuse(func); + if (failed(maybeAllocatedLDS)) { + LLVM_DEBUG(llvm::dbgs() << "Failed calling getAllocatedLDS\n"); + return signalPassFailure(); + } + int64_t allocatedLDS = maybeAllocatedLDS.value(); + + // not enough LDS memory + if (failed(checkLDSSize(func, allocatedLDS))) { + LLVM_DEBUG(llvm::dbgs() << "We require too much LDS memory: " + << allocatedLDS << " bytes\n"); + return signalPassFailure(); + } + SmallVector writes; - func.walk([&writes, &rewriter](ThreadwiseWriteAllOp threadwiseWriteAll) { + func.walk([&writes, &rewriter, + allocatedLDS](ThreadwiseWriteAllOp threadwiseWriteAll) { MemRefType destMemRefType = cast(threadwiseWriteAll.getDest().getType()); @@ -426,6 +432,14 @@ void RockOutputSwizzlePass::runOnOperation() { << ldsRequiredBytes << " bytes, skipping pass\n"); return; } + // heuristic: if we need more LDS, skip this pass + if (ldsRequiredBytes > allocatedLDS) { + LLVM_DEBUG(llvm::dbgs() + << "OutputSwizzle requires more LDS memory, current usage: " + << allocatedLDS << " bytes, required: " << ldsRequiredBytes + << " bytes, skipping pass\n"); + return; + } writes.push_back(threadwiseWriteAll); } }); diff --git a/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp b/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp index 2e700158086c..24e44de953a5 100644 --- a/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp +++ 
b/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp @@ -20,6 +20,7 @@ //===-----------------------------------------------------===// #include "mlir/Dialect/Rock/utility/AmdArchDb.h" #include "mlir/Dialect/Rock/utility/loweringUtils.h" +#include "mlir/Dialect/Rock/utility/memoryUtils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Rock/Passes.h" @@ -46,296 +47,41 @@ struct RockReuseLDSPass }; } // end anonymous namespace -static std::optional getWorkgroupMemorySize(MemRefType type) { - auto memSpaceValue = - dyn_cast_or_null(type.getMemorySpace()).getValue(); - if (memSpaceValue == gpu::GPUDialect::getWorkgroupAddressSpace()) { - return type.getNumElements() * getByteWidth(type.getElementType()); - } - return std::nullopt; -} - -static LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) { - // Check for arch limitations exceeded - FailureOr maybeArch = getArch(op); - if (succeeded(maybeArch)) { - StringAttr arch = maybeArch.value(); - const int64_t ldsSize = rock::lookupArchInfo(arch).maxSharedMemPerWG; - return success(ldsBytes <= ldsSize); - } - return success(); -} - -static void assignColors( - GpuAllocOp alloc, llvm::SetVector &usedColors, - llvm::MapVector &colorMemSize, - llvm::SmallDenseMap> &colorAssignment) { - const std::optional maybeRequiredSize = - getWorkgroupMemorySize(alloc.getOutput().getType()); - assert(maybeRequiredSize.has_value()); - const int64_t requiredSize = maybeRequiredSize.value(); - assert(requiredSize > 0); - int64_t color = 0; - int64_t allocatedSize = 0; - - while (allocatedSize < requiredSize) { - if (usedColors.contains(color)) { - // found a used color, all the assigned colors must be consecutive - // let's start again - allocatedSize = 0; - colorAssignment[alloc].clear(); - // Find the first available color - while (usedColors.contains(color)) { - color++; - } - } - colorAssignment[alloc].insert(color); - // New color - if (!colorMemSize.contains(color)) { - // Make this aligned to 128 bits - 
colorMemSize[color] = llvm::alignTo(requiredSize - allocatedSize, 16); - } - allocatedSize += colorMemSize[color]; - color++; - } -} - -/// This is a greedy graph coloring algorithm. -/// There are some changes to make it work for LDS, the main one: -/// each alloc can be assigned more than one color, this is because -/// in graph coloring all vertex are assumed to be the same size -/// (for example, register allocation). -/// Example: A=GpuAllocOp(1kB), B=GpuAllocOp(1kB), C=GpuAllocOp(2kB) -/// A <--> B C (A and B have an edge, C disjoint) -/// In this case, we can assign colors: A -> {0}, B -> {1}, and C -> {0, 1}. -/// Colors 0 and 1 are 1kB each. -/// Note: If an alloc has more than one color assigned, they have to be -/// consecutive. -static std::tuple, - SmallVector>> -graphColoring( - SmallVector &gpuAllocs, - llvm::SmallDenseMap> &interferenceGraph, - llvm::SmallDenseMap> &deallocBefore) { - llvm::SmallDenseMap> colorAssignment; - llvm::MapVector colorMemSize; - - SmallVector sortedAllocs(gpuAllocs); - - // Sort by alloc size - llvm::sort(sortedAllocs, [](GpuAllocOp &a, GpuAllocOp &b) { - auto aSize = getWorkgroupMemorySize(a.getOutput().getType()); - auto bSize = getWorkgroupMemorySize(b.getOutput().getType()); - assert(aSize.has_value() && bSize.has_value()); - return aSize.value() < bSize.value(); - }); - // Assign colors using greedy algorithm - for (GpuAllocOp alloc : sortedAllocs) { - llvm::SetVector usedColors; - for (GpuAllocOp neighbor : interferenceGraph[alloc]) { - if (colorAssignment.contains(neighbor)) { - for (int64_t color : colorAssignment[neighbor]) { - usedColors.insert(color); - } - } - } - - // Assign a set of colors - assignColors(alloc, usedColors, colorMemSize, colorAssignment); - } - - // If we replace all GpuAllocOps with a single one, we run into - // aliasing issues that cause performance regressions. - // In order to avoid that, we first merge colors so that each - // GpuAlloc is assigned a single color and an offset. 
- // Then, we can generate more than one GpuAllocOp and improve - // aliasing issues. - // This might be removed in the future if aliasing issues are solved. - llvm::SmallDenseMap> colorsToMerge; - llvm::SmallDenseMap oldColorToNew; - for (GpuAllocOp alloc : sortedAllocs) { - int64_t firstColor = colorAssignment[alloc][0]; - // if the color has been replaced already - if (oldColorToNew.contains(firstColor)) { - int64_t newColor = oldColorToNew[firstColor]; - // assign all the colors of the current 'alloc' to newColor - for (int64_t color : colorAssignment[alloc]) { - oldColorToNew[color] = newColor; - colorsToMerge[newColor].insert(color); - } - } else { - // the color has not been replaced yet - for (auto [i, color] : llvm::enumerate(colorAssignment[alloc])) { - // merge all non-first colors with the first one - if (i > 0) { - oldColorToNew[color] = firstColor; - colorsToMerge[firstColor].insert(color); - // if the current 'color' has merged some colors, - // merge its colors to 'firstColor' - if (colorsToMerge.contains(color)) { - for (int64_t otherColor : colorsToMerge[color]) { - oldColorToNew[otherColor] = firstColor; - colorsToMerge[firstColor].insert(otherColor); - } - colorsToMerge.erase(color); - } - } - } - } - } - // Compute offsets and new sizes (after merging) - llvm::MapVector mergedColorMemSize(colorMemSize); - llvm::MapVector colorOffset; - for (auto [color, _] : colorMemSize) { - colorOffset[color] = 0; - } - for (auto [color, oldColors] : colorsToMerge) { - for (int64_t oldColor : oldColors) { - assert(oldColor > color); - colorOffset[oldColor] = mergedColorMemSize[color]; - mergedColorMemSize[color] += mergedColorMemSize[oldColor]; - mergedColorMemSize.erase(oldColor); - } - } - - // Compute information per GpuAllocOp - SmallVector> gpuAllocInfo; - llvm::SetVector usedColors; - for (const GpuAllocOp alloc : gpuAllocs) { - assert(colorAssignment.contains(alloc)); - - // if the color has been used, we are "reusing" memory, - // we need a LDS 
barrier - bool useLDSBarrier = false; - for (int64_t color : colorAssignment[alloc]) { - useLDSBarrier |= usedColors.contains(color); - usedColors.insert(color); - } - - if (useLDSBarrier) { - for (GpuAllocOp deadAlloc : deallocBefore[alloc]) { - for (int64_t color : colorAssignment[deadAlloc]) { - if (!colorAssignment[alloc].contains(color)) { - usedColors.remove(color); - } - } - } - } - - int64_t oldColor = colorAssignment[alloc][0]; - assert(colorOffset.contains(oldColor)); - int64_t offset = colorOffset[oldColor]; - int64_t newColor = - (oldColorToNew.contains(oldColor)) ? oldColorToNew[oldColor] : oldColor; - gpuAllocInfo.push_back(std::tuple(alloc, newColor, offset, useLDSBarrier)); - } - - return std::tuple(mergedColorMemSize, gpuAllocInfo); -} - static LogicalResult reuseLDS(func::FuncOp &func) { - IRRewriter rewriter(func->getContext()); - - SmallVector allocs; - SmallVector deallocs; - SetVector currentAllocs; - llvm::SmallDenseMap> - interferenceGraph; - llvm::SmallDenseMap memrefToAlloc; - llvm::SmallDenseMap> deallocBefore; - llvm::SetVector deallocsUpToNow; - - // Create the interference graph and save all allocs and deallocs (LDS) - WalkResult walkResult = func.walk([&](Operation *op) -> WalkResult { - if (auto gpuAlloc = dyn_cast(op)) { - auto type = gpuAlloc.getOutput().getType(); - - std::optional maybeSize = getWorkgroupMemorySize(type); - if (maybeSize.has_value()) { - int64_t size = maybeSize.value(); - LLVM_DEBUG(llvm::dbgs() - << "Found rock.alloc of " << size << " bytes\n"); - - // save deallocs up to this point - deallocBefore[gpuAlloc] = SetVector(deallocsUpToNow.begin(), - deallocsUpToNow.end()); - deallocsUpToNow.clear(); - - // add vertex and connections - for (auto alloc : currentAllocs) { - interferenceGraph[alloc].insert(gpuAlloc); - interferenceGraph[gpuAlloc].insert(alloc); - } - // if it has no neighbors, we still want to add a vertex - if (currentAllocs.empty()) { - interferenceGraph[gpuAlloc] = {}; - } - 
currentAllocs.insert(gpuAlloc); - memrefToAlloc[gpuAlloc.getOutput()] = gpuAlloc; - allocs.push_back(gpuAlloc); - } - } else if (auto gpuDealloc = dyn_cast(op)) { - auto type = gpuDealloc.getMemref().getType(); - std::optional maybeSize = getWorkgroupMemorySize(type); - if (maybeSize.has_value()) { - int64_t size = maybeSize.value(); - LLVM_DEBUG(llvm::dbgs() - << "Found rock.dealloc of " << size << " bytes\n"); - - if (memrefToAlloc.find(gpuDealloc.getMemref()) == memrefToAlloc.end()) { - LLVM_DEBUG(llvm::dbgs() << "Called rock.dealloc multiple times?\n"); - return WalkResult::interrupt(); - } - bool erased = - currentAllocs.remove(memrefToAlloc[gpuDealloc.getMemref()]); - deallocsUpToNow.insert(memrefToAlloc[gpuDealloc.getMemref()]); - if (!erased) { - LLVM_DEBUG(llvm::dbgs() << "Called rock.dealloc multiple times?\n"); - return WalkResult::interrupt(); - } - deallocs.push_back(gpuDealloc); - } - } - return WalkResult::advance(); - }); - - if (walkResult.wasInterrupted()) { - LLVM_DEBUG(llvm::dbgs() << "Walk interrupted\n"); + FailureOr maybeLdsInfo = createInterferenceGraph(func); + if (failed(maybeLdsInfo)) { return failure(); } + LDSInfo ldsInfo = maybeLdsInfo.value(); - // same number of rock.alloc and rock.dealloc - if (deallocs.size() != allocs.size() || - allocs.size() != interferenceGraph.size() || !currentAllocs.empty()) { - LLVM_DEBUG(llvm::dbgs() << "There should be an equal number of rock.alloc " - "and rock.dealloc (for LDS)\n"); - return failure(); + // add debug information + for (GpuAllocOp alloc : ldsInfo.allocs) { + auto type = alloc.getOutput().getType(); + std::optional maybeSize = getWorkgroupMemorySize(type); + assert(maybeSize.has_value()); + int64_t size = maybeSize.value(); + LLVM_DEBUG(llvm::dbgs() << "Found rock.alloc of " << size << " bytes\n"); + } + for (GpuDeallocOp dealloc : ldsInfo.deallocs) { + auto type = dealloc.getMemref().getType(); + std::optional maybeSize = getWorkgroupMemorySize(type); + assert(maybeSize.has_value()); + 
int64_t size = maybeSize.value(); + LLVM_DEBUG(llvm::dbgs() << "Found rock.dealloc of " << size << " bytes\n"); } // nothing to do if there is only one (or none) LDS allocation - if (interferenceGraph.size() < 2) { + if (ldsInfo.interferenceGraph.size() < 2) { LLVM_DEBUG(llvm::dbgs() << "Not enough LDS allocations, skipping pass\n"); return success(); } llvm::MapVector colorSizes; SmallVector> allocOffsets; - std::tie(colorSizes, allocOffsets) = - graphColoring(allocs, interferenceGraph, deallocBefore); - - int64_t requiredMemory = 0; - for (auto [_, size] : colorSizes) { - requiredMemory += size; - } - - // not enough LDS memory - if (failed(checkLDSSize(func, requiredMemory))) { - LLVM_DEBUG(llvm::dbgs() << "ReuseLDS requires too much LDS memory: " - << requiredMemory << " bytes\n"); - return failure(); - } + std::tie(colorSizes, allocOffsets) = graphColoring(ldsInfo); // write the new GpuAllocOp to the start + IRRewriter rewriter(func->getContext()); rewriter.setInsertionPointToStart(&func.getBody().front()); // New allocations @@ -407,9 +153,9 @@ static LogicalResult reuseLDS(func::FuncOp &func) { // Remove all GpuDeallocOps but the last one and add a new alloc/dealloc pair // for each buffer - for (auto [i, dealloc] : llvm::enumerate(deallocs)) { + for (auto [i, dealloc] : llvm::enumerate(ldsInfo.deallocs)) { rewriter.setInsertionPointAfter(dealloc); - if (i == deallocs.size() - 1) { + if (i == ldsInfo.deallocs.size() - 1) { GpuDeallocOp prevDealloc = rewriter.replaceOpWithNewOp( dealloc, std::get<1>(colorAllocs.front())); for (auto [i, pair] : llvm::enumerate(colorAllocs)) { diff --git a/mlir/lib/Dialect/Rock/utility/CMakeLists.txt b/mlir/lib/Dialect/Rock/utility/CMakeLists.txt index f0b834c71abb..423822d309ea 100644 --- a/mlir/lib/Dialect/Rock/utility/CMakeLists.txt +++ b/mlir/lib/Dialect/Rock/utility/CMakeLists.txt @@ -5,6 +5,7 @@ add_rocmlir_dialect_library(MLIRRockUtility loweringUtils.cpp transformMapUtils.cpp fusionUtils.cpp + memoryUtils.cpp 
ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Rock diff --git a/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp b/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp new file mode 100644 index 000000000000..f577c4fba3fa --- /dev/null +++ b/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp @@ -0,0 +1,283 @@ +//===- memoryUtils.cpp - Rock memory utility functions +//---------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------===// + +#include "mlir/Dialect/Rock/utility/memoryUtils.h" + +#include "mlir/Dialect/Rock/utility/AmdArchDb.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Rock/utility/builderUtils.h" +#include "mlir/Dialect/Rock/utility/loweringUtils.h" + +namespace mlir { +namespace rock { + +std::optional getWorkgroupMemorySize(MemRefType type) { + auto memSpaceValue = + dyn_cast_or_null(type.getMemorySpace()).getValue(); + if (memSpaceValue == gpu::GPUDialect::getWorkgroupAddressSpace()) { + return type.getNumElements() * getByteWidth(type.getElementType()); + } + return std::nullopt; +} + +static void assignColors( + GpuAllocOp alloc, llvm::SetVector &usedColors, + llvm::MapVector &colorMemSize, + llvm::SmallDenseMap> &colorAssignment) { + const std::optional maybeRequiredSize = + getWorkgroupMemorySize(alloc.getOutput().getType()); + assert(maybeRequiredSize.has_value()); + const int64_t requiredSize = maybeRequiredSize.value(); + assert(requiredSize > 0); + int64_t color = 0; + int64_t allocatedSize = 0; + + while (allocatedSize < requiredSize) { + if (usedColors.contains(color)) { + // found a used color, all the assigned colors must be consecutive + // let's start again + allocatedSize = 0; + colorAssignment[alloc].clear(); + // Find the first available color + while 
(usedColors.contains(color)) { + color++; + } + } + colorAssignment[alloc].insert(color); + // New color + if (!colorMemSize.contains(color)) { + // Make this aligned to 128 bits + colorMemSize[color] = llvm::alignTo(requiredSize - allocatedSize, 16); + } + allocatedSize += colorMemSize[color]; + color++; + } +} + +LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) { + // Check for arch limitations exceeded + FailureOr maybeArch = getArch(op); + if (succeeded(maybeArch)) { + StringAttr arch = maybeArch.value(); + const int64_t ldsSize = rock::lookupArchInfo(arch).maxSharedMemPerWG; + return success(ldsBytes <= ldsSize); + } + return success(); +} + +std::tuple, + SmallVector>> +graphColoring(LDSInfo &ldsInfo) { + llvm::SmallDenseMap> colorAssignment; + llvm::MapVector colorMemSize; + + SmallVector sortedAllocs(ldsInfo.allocs); + + // Sort by alloc size + llvm::sort(sortedAllocs, [](GpuAllocOp &a, GpuAllocOp &b) { + auto aSize = getWorkgroupMemorySize(a.getOutput().getType()); + auto bSize = getWorkgroupMemorySize(b.getOutput().getType()); + assert(aSize.has_value() && bSize.has_value()); + return aSize.value() < bSize.value(); + }); + // Assign colors using greedy algorithm + for (GpuAllocOp alloc : sortedAllocs) { + llvm::SetVector usedColors; + for (GpuAllocOp neighbor : ldsInfo.interferenceGraph[alloc]) { + if (colorAssignment.contains(neighbor)) { + for (int64_t color : colorAssignment[neighbor]) { + usedColors.insert(color); + } + } + } + + // Assign a set of colors + assignColors(alloc, usedColors, colorMemSize, colorAssignment); + } + + // If we replace all GpuAllocOps with a single one, we run into + // aliasing issues that cause performance regressions. + // In order to avoid that, we first merge colors so that each + // GpuAlloc is assigned a single color and an offset. + // Then, we can generate more than one GpuAllocOp and improve + // aliasing issues. + // This might be removed in the future if aliasing issues are solved. 
+ llvm::SmallDenseMap> colorsToMerge; + llvm::SmallDenseMap oldColorToNew; + for (GpuAllocOp alloc : sortedAllocs) { + int64_t firstColor = colorAssignment[alloc][0]; + // if the color has been replaced already + if (oldColorToNew.contains(firstColor)) { + int64_t newColor = oldColorToNew[firstColor]; + // assign all the colors of the current 'alloc' to newColor + for (int64_t color : colorAssignment[alloc]) { + oldColorToNew[color] = newColor; + colorsToMerge[newColor].insert(color); + } + } else { + // the color has not been replaced yet + for (auto [i, color] : llvm::enumerate(colorAssignment[alloc])) { + // merge all non-first colors with the first one + if (i > 0) { + oldColorToNew[color] = firstColor; + colorsToMerge[firstColor].insert(color); + // if the current 'color' has merged some colors, + // merge its colors to 'firstColor' + if (colorsToMerge.contains(color)) { + for (int64_t otherColor : colorsToMerge[color]) { + oldColorToNew[otherColor] = firstColor; + colorsToMerge[firstColor].insert(otherColor); + } + colorsToMerge.erase(color); + } + } + } + } + } + // Compute offsets and new sizes (after merging) + llvm::MapVector mergedColorMemSize(colorMemSize); + llvm::MapVector colorOffset; + for (auto [color, _] : colorMemSize) { + colorOffset[color] = 0; + } + for (auto [color, oldColors] : colorsToMerge) { + for (int64_t oldColor : oldColors) { + assert(oldColor > color); + colorOffset[oldColor] = mergedColorMemSize[color]; + mergedColorMemSize[color] += mergedColorMemSize[oldColor]; + mergedColorMemSize.erase(oldColor); + } + } + + // Compute information per GpuAllocOp + SmallVector> gpuAllocInfo; + llvm::SetVector usedColors; + for (const GpuAllocOp alloc : ldsInfo.allocs) { + assert(colorAssignment.contains(alloc)); + + // if the color has been used, we are "reusing" memory, + // we need a LDS barrier + bool useLDSBarrier = false; + for (int64_t color : colorAssignment[alloc]) { + useLDSBarrier |= usedColors.contains(color); + 
usedColors.insert(color); + } + + if (useLDSBarrier) { + for (GpuAllocOp deadAlloc : ldsInfo.deallocBefore[alloc]) { + for (int64_t color : colorAssignment[deadAlloc]) { + if (!colorAssignment[alloc].contains(color)) { + usedColors.remove(color); + } + } + } + } + + int64_t oldColor = colorAssignment[alloc][0]; + assert(colorOffset.contains(oldColor)); + int64_t offset = colorOffset[oldColor]; + int64_t newColor = + (oldColorToNew.contains(oldColor)) ? oldColorToNew[oldColor] : oldColor; + gpuAllocInfo.push_back(std::tuple(alloc, newColor, offset, useLDSBarrier)); + } + + return std::tuple(mergedColorMemSize, gpuAllocInfo); +} + +FailureOr createInterferenceGraph(func::FuncOp &func) { + LDSInfo ldsInfo; + SetVector currentAllocs; + llvm::SmallDenseMap memrefToAlloc; + llvm::SetVector deallocsUpToNow; + + // Create the interference graph and save all allocs and deallocs (LDS) + WalkResult walkResult = func.walk([&](Operation *op) -> WalkResult { + if (auto gpuAlloc = dyn_cast(op)) { + auto type = gpuAlloc.getOutput().getType(); + + std::optional maybeSize = getWorkgroupMemorySize(type); + if (maybeSize.has_value()) { + // save deallocs up to this point + ldsInfo.deallocBefore[gpuAlloc] = SetVector( + deallocsUpToNow.begin(), deallocsUpToNow.end()); + deallocsUpToNow.clear(); + + // add vertex and connections + for (auto alloc : currentAllocs) { + ldsInfo.interferenceGraph[alloc].insert(gpuAlloc); + ldsInfo.interferenceGraph[gpuAlloc].insert(alloc); + } + // if it has no neighbors, we still want to add a vertex + if (currentAllocs.empty()) { + ldsInfo.interferenceGraph[gpuAlloc] = {}; + } + currentAllocs.insert(gpuAlloc); + memrefToAlloc[gpuAlloc.getOutput()] = gpuAlloc; + ldsInfo.allocs.push_back(gpuAlloc); + } + } else if (auto gpuDealloc = dyn_cast(op)) { + auto type = gpuDealloc.getMemref().getType(); + std::optional maybeSize = getWorkgroupMemorySize(type); + if (maybeSize.has_value()) { + if (memrefToAlloc.find(gpuDealloc.getMemref()) == memrefToAlloc.end()) { 
+ return WalkResult::interrupt(); + } + bool erased = + currentAllocs.remove(memrefToAlloc[gpuDealloc.getMemref()]); + deallocsUpToNow.insert(memrefToAlloc[gpuDealloc.getMemref()]); + if (!erased) { + return WalkResult::interrupt(); + } + ldsInfo.deallocs.push_back(gpuDealloc); + } + } + return WalkResult::advance(); + }); + + if (walkResult.wasInterrupted()) { + if (ldsInfo.allocs.empty()) + return emitError(UnknownLoc::get(func.getContext()), "Unexpected error"); + return ldsInfo.allocs.front().emitError( + "Called rock.dealloc multiple times"); + } + + // same number of rock.alloc and rock.dealloc + if (ldsInfo.deallocs.size() != ldsInfo.allocs.size() || + ldsInfo.allocs.size() != ldsInfo.interferenceGraph.size() || + !currentAllocs.empty()) { + return emitError(UnknownLoc::get(func.getContext()), + "There should be an equal number of rock.alloc and " + "rock.dealloc (for LDS)"); + } + + return ldsInfo; +} + +FailureOr getAllocatedLDSAfterReuse(func::FuncOp &func) { + FailureOr maybeLdsInfo = createInterferenceGraph(func); + if (failed(maybeLdsInfo)) { + return failure(); + } + LDSInfo ldsInfo = maybeLdsInfo.value(); + + llvm::MapVector colorSizes; + SmallVector> allocOffsets; + std::tie(colorSizes, allocOffsets) = graphColoring(ldsInfo); + + int64_t requiredMemory = 0; + for (auto [_, size] : colorSizes) { + requiredMemory += size; + } + + return requiredMemory; +} + +} // namespace rock +} // namespace mlir \ No newline at end of file