From 3a490934bff3b526797abf8f01c9ffc6c660547e Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <dhernandez0@gmail.com>
Date: Tue, 17 Sep 2024 16:25:19 +0000
Subject: [PATCH] Refactor Reuse LDS to be able to keep the same heuristic and
 also revert the enableApplicability change

---
 .../mlir/Dialect/Rock/utility/memoryUtils.h   |  49 +++
 mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp |  26 +-
 .../Dialect/Rock/Transforms/OutputSwizzle.cpp |  44 ++-
 mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp | 300 ++----------------
 mlir/lib/Dialect/Rock/utility/CMakeLists.txt  |   1 +
 mlir/lib/Dialect/Rock/utility/memoryUtils.cpp | 283 +++++++++++++++++
 6 files changed, 398 insertions(+), 305 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h
 create mode 100644 mlir/lib/Dialect/Rock/utility/memoryUtils.cpp
diff --git a/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h b/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h
new file mode 100644
index 000000000000..deafabe3f50d
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Rock/utility/memoryUtils.h
@@ -0,0 +1,49 @@
+#ifndef MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H
+#define MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H
+
+#include "mlir/Dialect/Rock/IR/Rock.h"
+#include "llvm/ADT/MapVector.h"
+
+namespace mlir {
+namespace rock {
+
+struct LDSInfo {
+  llvm::SmallDenseMap<GpuAllocOp, llvm::SetVector<GpuAllocOp>>
+      interferenceGraph; // alloc -> allocs that are live at the same time
+  SmallVector<GpuAllocOp> allocs; // LDS allocs, in walk order
+  SmallVector<GpuDeallocOp> deallocs; // LDS deallocs, in walk order
+  llvm::SmallDenseMap<GpuAllocOp, llvm::SetVector<GpuAllocOp>> deallocBefore;
+}; // deallocBefore[a]: allocs deallocated between the previous alloc and `a`
+
+/// Utility function to get workgroup memory size
+std::optional<int64_t> getWorkgroupMemorySize(MemRefType type);
+
+/// Utility function to check if there is enough LDS on the target architecture
+LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes);
+
+/// This is a greedy graph coloring algorithm.
+/// There are some changes to make it work for LDS, the main one:
+/// each alloc can be assigned more than one color, this is because
+/// in graph coloring all vertices are assumed to be the same size
+/// (for example, register allocation).
+/// Example: A=GpuAllocOp(1kB), B=GpuAllocOp(1kB), C=GpuAllocOp(2kB)
+///           A <--> B     C (A and B have an edge, C disjoint)
+/// In this case, we can assign colors: A -> {0}, B -> {1}, and C -> {0, 1}.
+/// Colors 0 and 1 are 1kB each.
+/// Note: If an alloc has more than one color assigned, they have to be
+///       consecutive.
+std::tuple<llvm::MapVector<int64_t, int64_t>,
+           SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>>>
+graphColoring(LDSInfo &ldsInfo);
+
+/// Utility function to create an interference graph of GPUAllocs and
+/// GPUDeallocs
+FailureOr<LDSInfo> createInterferenceGraph(func::FuncOp &func);
+
+/// Utility function to compute allocated LDS after LDS reuse pass.
+FailureOr<int64_t> getAllocatedLDSAfterReuse(func::FuncOp &func);
+
+} // namespace rock
+} // namespace mlir
+
+#endif // MLIR_DIALECT_ROCK_UTILITY_MEMORYUTILS_H
diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
index c66176f60e8f..db4c3314e8ce 100644
--- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
+++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
@@ -156,20 +156,20 @@ void rock::buildKernelPipeline(OpPassManager &pm,
   funcPm.addPass(rock::createRockBlockwiseGemmToThreadwisePass());
   funcPm.addPass(rock::createRockOutputSwizzlePass());
 
-  if (options.enableFusion) {
-    // align linalg tiling
-    /* rocmlir-opt --rock-linalg-align --rock-pipeline --canonicalize
-     * --convert-linalg-to-affine-loops --rock-vectorize-fusions
-     */
-    funcPm.addPass(rock::createRockLinalgAlignPass());
-    funcPm.addPass(rock::createRockPipelinePass());
-    funcPm.addPass(createCanonicalizerPass());
-    funcPm.addPass(createConvertLinalgToAffineLoopsPass());
-    funcPm.addPass(rock::createRockVectorizeFusionsPass());
-  }
-  funcPm.addPass(rock::createRockReuseLDSPass());
-
   if (!options.enableApplicability) {
+    if (options.enableFusion) {
+      // align linalg tiling
+      /* rocmlir-opt --rock-linalg-align --rock-pipeline --canonicalize
+       * --convert-linalg-to-affine-loops --rock-vectorize-fusions
+       */
+      funcPm.addPass(rock::createRockLinalgAlignPass());
+      funcPm.addPass(rock::createRockPipelinePass());
+      funcPm.addPass(createCanonicalizerPass());
+      funcPm.addPass(createConvertLinalgToAffineLoopsPass());
+      funcPm.addPass(rock::createRockVectorizeFusionsPass());
+    }
+    funcPm.addPass(rock::createRockReuseLDSPass());
+
     // rock lowering for reductions
     /* rocmlir-opt --rock-lower-reduce
      */
diff --git a/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp b/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp
index 3af02e1d3450..faa16893b8ef 100644
--- a/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp
+++ b/mlir/lib/Dialect/Rock/Transforms/OutputSwizzle.cpp
@@ -22,7 +22,6 @@
 #include "mlir/Dialect/Rock/IR/Rock.h"
 #include "mlir/Dialect/Rock/IR/TransformMapBuilder.h"
 #include "mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h"
-#include "mlir/Dialect/Rock/utility/AmdArchDb.h"
 #include "mlir/Dialect/Rock/utility/loweringUtils.h"
 #include "mlir/Dialect/Rock/utility/math.h"
 #include "mlir/Dialect/Rock/utility/transformMapUtils.h"
@@ -32,6 +31,7 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Rock/Passes.h"
 #include "mlir/Dialect/Rock/utility/builderUtils.h"
+#include "mlir/Dialect/Rock/utility/memoryUtils.h"
 #include "mlir/Dialect/Rock/utility/transformMapUtils.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -80,17 +80,6 @@ static bool hasGlobalMemoryAddressSpace(MemRefType type) {
          !hasPrivateMemoryAddressSpace(type);
 }
 
-static LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) {
-  // Check for arch limitations exceeded
-  FailureOr<StringAttr> maybeArch = getArch(op);
-  if (succeeded(maybeArch)) {
-    StringAttr arch = maybeArch.value();
-    const int64_t ldsSize = rock::lookupArchInfo(arch).maxSharedMemPerWG;
-    return success(ldsBytes <= ldsSize);
-  }
-  return success();
-}
-
 static std::optional<std::tuple<int64_t, int64_t, ArrayAttr>>
 getIdToLDS(ThreadwiseWriteAllOp &op, OpBuilder &b) {
   ArrayAttr srcTransform = op.getExtraViewsAttr();
@@ -195,8 +184,9 @@ struct ThreadwiseWriteAllRewritePattern
       return success();
     }
     size_t extraIdxCount = op.getExtraIndices().size();
-    VectorizationResult vectorRes =
-        getMaxVectorization(destView, extraIdxCount, /*inputDimLen=*/std::nullopt, destView.getDefiningOp());
+    VectorizationResult vectorRes = getMaxVectorization(
+        destView, extraIdxCount, /*inputDimLen=*/std::nullopt,
+        destView.getDefiningOp());
     int64_t originalVectorLen = vectorRes.max;
 
     if (vectorLen <= originalVectorLen) {
@@ -398,8 +388,24 @@ void RockOutputSwizzlePass::runOnOperation() {
   if (!func->hasAttr("kernel"))
     return;
 
+  // Get allocated LDS after "reuse LDS" pass
+  FailureOr<int64_t> maybeAllocatedLDS = getAllocatedLDSAfterReuse(func);
+  if (failed(maybeAllocatedLDS)) {
+    LLVM_DEBUG(llvm::dbgs() << "Failed calling getAllocatedLDS\n");
+    return signalPassFailure();
+  }
+  int64_t allocatedLDS = maybeAllocatedLDS.value();
+
+  // not enough LDS memory
+  if (failed(checkLDSSize(func, allocatedLDS))) {
+    LLVM_DEBUG(llvm::dbgs() << "We require too much LDS memory: "
+                            << allocatedLDS << " bytes\n");
+    return signalPassFailure();
+  }
+
   SmallVector<Operation *, 4> writes;
-  func.walk([&writes, &rewriter](ThreadwiseWriteAllOp threadwiseWriteAll) {
+  func.walk([&writes, &rewriter,
+             allocatedLDS](ThreadwiseWriteAllOp threadwiseWriteAll) {
     MemRefType destMemRefType =
         cast<MemRefType>(threadwiseWriteAll.getDest().getType());
 
@@ -426,6 +432,14 @@ void RockOutputSwizzlePass::runOnOperation() {
                    << ldsRequiredBytes << " bytes, skipping pass\n");
         return;
       }
+      // heuristic: if we need more LDS, skip this pass
+      if (ldsRequiredBytes > allocatedLDS) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "OutputSwizzle requires more LDS memory, current usage: "
+                   << allocatedLDS << " bytes, required: " << ldsRequiredBytes
+                   << " bytes, skipping pass\n");
+        return;
+      }
       writes.push_back(threadwiseWriteAll);
     }
   });
diff --git a/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp b/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp
index 2e700158086c..24e44de953a5 100644
--- a/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp
+++ b/mlir/lib/Dialect/Rock/Transforms/ReuseLDS.cpp
@@ -20,6 +20,7 @@
 //===-----------------------------------------------------===//
 #include "mlir/Dialect/Rock/utility/AmdArchDb.h"
 #include "mlir/Dialect/Rock/utility/loweringUtils.h"
+#include "mlir/Dialect/Rock/utility/memoryUtils.h"
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Rock/Passes.h"
@@ -46,296 +47,41 @@ struct RockReuseLDSPass
 };
 } // end anonymous namespace
 
-static std::optional<int64_t> getWorkgroupMemorySize(MemRefType type) {
-  auto memSpaceValue =
-      dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace()).getValue();
-  if (memSpaceValue == gpu::GPUDialect::getWorkgroupAddressSpace()) {
-    return type.getNumElements() * getByteWidth(type.getElementType());
-  }
-  return std::nullopt;
-}
-
-static LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) {
-  // Check for arch limitations exceeded
-  FailureOr<StringAttr> maybeArch = getArch(op);
-  if (succeeded(maybeArch)) {
-    StringAttr arch = maybeArch.value();
-    const int64_t ldsSize = rock::lookupArchInfo(arch).maxSharedMemPerWG;
-    return success(ldsBytes <= ldsSize);
-  }
-  return success();
-}
-
-static void assignColors(
-    GpuAllocOp alloc, llvm::SetVector<int64_t> &usedColors,
-    llvm::MapVector<int64_t, int64_t> &colorMemSize,
-    llvm::SmallDenseMap<GpuAllocOp, SetVector<int64_t>> &colorAssignment) {
-  const std::optional<int64_t> maybeRequiredSize =
-      getWorkgroupMemorySize(alloc.getOutput().getType());
-  assert(maybeRequiredSize.has_value());
-  const int64_t requiredSize = maybeRequiredSize.value();
-  assert(requiredSize > 0);
-  int64_t color = 0;
-  int64_t allocatedSize = 0;
-
-  while (allocatedSize < requiredSize) {
-    if (usedColors.contains(color)) {
-      // found a used color, all the assigned colors must be consecutive
-      // let's start again
-      allocatedSize = 0;
-      colorAssignment[alloc].clear();
-      // Find the first available color
-      while (usedColors.contains(color)) {
-        color++;
-      }
-    }
-    colorAssignment[alloc].insert(color);
-    // New color
-    if (!colorMemSize.contains(color)) {
-      // Make this aligned to 128 bits
-      colorMemSize[color] = llvm::alignTo(requiredSize - allocatedSize, 16);
-    }
-    allocatedSize += colorMemSize[color];
-    color++;
-  }
-}
-
-/// This is a greedy graph coloring algorithm.
-/// There are some changes to make it work for LDS, the main one:
-/// each alloc can be assigned more than one color, this is because
-/// in graph coloring all vertex are assumed to be the same size
-/// (for example, register allocation).
-/// Example: A=GpuAllocOp(1kB), B=GpuAllocOp(1kB), C=GpuAllocOp(2kB)
-///           A <--> B     C (A and B have an edge, C disjoint)
-/// In this case, we can assign colors: A -> {0}, B -> {1}, and C -> {0, 1}.
-/// Colors 0 and 1 are 1kB each.
-/// Note: If an alloc has more than one color assigned, they have to be
-///       consecutive.
-static std::tuple<llvm::MapVector<int64_t, int64_t>,
-                  SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>>>
-graphColoring(
-    SmallVector<GpuAllocOp> &gpuAllocs,
-    llvm::SmallDenseMap<GpuAllocOp, SetVector<GpuAllocOp>> &interferenceGraph,
-    llvm::SmallDenseMap<GpuAllocOp, SetVector<GpuAllocOp>> &deallocBefore) {
-  llvm::SmallDenseMap<GpuAllocOp, SetVector<int64_t>> colorAssignment;
-  llvm::MapVector<int64_t, int64_t> colorMemSize;
-
-  SmallVector<GpuAllocOp> sortedAllocs(gpuAllocs);
-
-  // Sort by alloc size
-  llvm::sort(sortedAllocs, [](GpuAllocOp &a, GpuAllocOp &b) {
-    auto aSize = getWorkgroupMemorySize(a.getOutput().getType());
-    auto bSize = getWorkgroupMemorySize(b.getOutput().getType());
-    assert(aSize.has_value() && bSize.has_value());
-    return aSize.value() < bSize.value();
-  });
-  // Assign colors using greedy algorithm
-  for (GpuAllocOp alloc : sortedAllocs) {
-    llvm::SetVector<int64_t> usedColors;
-    for (GpuAllocOp neighbor : interferenceGraph[alloc]) {
-      if (colorAssignment.contains(neighbor)) {
-        for (int64_t color : colorAssignment[neighbor]) {
-          usedColors.insert(color);
-        }
-      }
-    }
-
-    // Assign a set of colors
-    assignColors(alloc, usedColors, colorMemSize, colorAssignment);
-  }
-
-  // If we replace all GpuAllocOps with a single one, we run into
-  // aliasing issues that cause performance regressions.
-  // In order to avoid that, we first merge colors so that each
-  // GpuAlloc is assigned a single color and an offset.
-  // Then, we can generate more than one GpuAllocOp and improve
-  // aliasing issues.
-  // This might be removed in the future if aliasing issues are solved.
-  llvm::SmallDenseMap<int64_t, SetVector<int64_t>> colorsToMerge;
-  llvm::SmallDenseMap<int64_t, int64_t> oldColorToNew;
-  for (GpuAllocOp alloc : sortedAllocs) {
-    int64_t firstColor = colorAssignment[alloc][0];
-    // if the color has been replaced already
-    if (oldColorToNew.contains(firstColor)) {
-      int64_t newColor = oldColorToNew[firstColor];
-      // assign all the colors of the current 'alloc' to newColor
-      for (int64_t color : colorAssignment[alloc]) {
-        oldColorToNew[color] = newColor;
-        colorsToMerge[newColor].insert(color);
-      }
-    } else {
-      // the color has not been replaced yet
-      for (auto [i, color] : llvm::enumerate(colorAssignment[alloc])) {
-        // merge all non-first colors with the first one
-        if (i > 0) {
-          oldColorToNew[color] = firstColor;
-          colorsToMerge[firstColor].insert(color);
-          // if the current 'color' has merged some colors,
-          // merge its colors to 'firstColor'
-          if (colorsToMerge.contains(color)) {
-            for (int64_t otherColor : colorsToMerge[color]) {
-              oldColorToNew[otherColor] = firstColor;
-              colorsToMerge[firstColor].insert(otherColor);
-            }
-            colorsToMerge.erase(color);
-          }
-        }
-      }
-    }
-  }
-  // Compute offsets and new sizes (after merging)
-  llvm::MapVector<int64_t, int64_t> mergedColorMemSize(colorMemSize);
-  llvm::MapVector<int64_t, int64_t> colorOffset;
-  for (auto [color, _] : colorMemSize) {
-    colorOffset[color] = 0;
-  }
-  for (auto [color, oldColors] : colorsToMerge) {
-    for (int64_t oldColor : oldColors) {
-      assert(oldColor > color);
-      colorOffset[oldColor] = mergedColorMemSize[color];
-      mergedColorMemSize[color] += mergedColorMemSize[oldColor];
-      mergedColorMemSize.erase(oldColor);
-    }
-  }
-
-  // Compute information per GpuAllocOp
-  SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>> gpuAllocInfo;
-  llvm::SetVector<int64_t> usedColors;
-  for (const GpuAllocOp alloc : gpuAllocs) {
-    assert(colorAssignment.contains(alloc));
-
-    // if the color has been used, we are "reusing" memory,
-    // we need a LDS barrier
-    bool useLDSBarrier = false;
-    for (int64_t color : colorAssignment[alloc]) {
-      useLDSBarrier |= usedColors.contains(color);
-      usedColors.insert(color);
-    }
-
-    if (useLDSBarrier) {
-      for (GpuAllocOp deadAlloc : deallocBefore[alloc]) {
-        for (int64_t color : colorAssignment[deadAlloc]) {
-          if (!colorAssignment[alloc].contains(color)) {
-            usedColors.remove(color);
-          }
-        }
-      }
-    }
-
-    int64_t oldColor = colorAssignment[alloc][0];
-    assert(colorOffset.contains(oldColor));
-    int64_t offset = colorOffset[oldColor];
-    int64_t newColor =
-        (oldColorToNew.contains(oldColor)) ? oldColorToNew[oldColor] : oldColor;
-    gpuAllocInfo.push_back(std::tuple(alloc, newColor, offset, useLDSBarrier));
-  }
-
-  return std::tuple(mergedColorMemSize, gpuAllocInfo);
-}
-
 static LogicalResult reuseLDS(func::FuncOp &func) {
-  IRRewriter rewriter(func->getContext());
-
-  SmallVector<GpuAllocOp> allocs;
-  SmallVector<GpuDeallocOp> deallocs;
-  SetVector<GpuAllocOp> currentAllocs;
-  llvm::SmallDenseMap<GpuAllocOp, llvm::SetVector<GpuAllocOp>>
-      interferenceGraph;
-  llvm::SmallDenseMap<Value, GpuAllocOp> memrefToAlloc;
-  llvm::SmallDenseMap<GpuAllocOp, llvm::SetVector<GpuAllocOp>> deallocBefore;
-  llvm::SetVector<GpuAllocOp> deallocsUpToNow;
-
-  // Create the interference graph and save all allocs and deallocs (LDS)
-  WalkResult walkResult = func.walk([&](Operation *op) -> WalkResult {
-    if (auto gpuAlloc = dyn_cast<GpuAllocOp>(op)) {
-      auto type = gpuAlloc.getOutput().getType();
-
-      std::optional<int64_t> maybeSize = getWorkgroupMemorySize(type);
-      if (maybeSize.has_value()) {
-        int64_t size = maybeSize.value();
-        LLVM_DEBUG(llvm::dbgs()
-                   << "Found rock.alloc of " << size << " bytes\n");
-
-        // save deallocs up to this point
-        deallocBefore[gpuAlloc] = SetVector<GpuAllocOp>(deallocsUpToNow.begin(),
-                                                        deallocsUpToNow.end());
-        deallocsUpToNow.clear();
-
-        // add vertex and connections
-        for (auto alloc : currentAllocs) {
-          interferenceGraph[alloc].insert(gpuAlloc);
-          interferenceGraph[gpuAlloc].insert(alloc);
-        }
-        // if it has no neighbors, we still want to add a vertex
-        if (currentAllocs.empty()) {
-          interferenceGraph[gpuAlloc] = {};
-        }
-        currentAllocs.insert(gpuAlloc);
-        memrefToAlloc[gpuAlloc.getOutput()] = gpuAlloc;
-        allocs.push_back(gpuAlloc);
-      }
-    } else if (auto gpuDealloc = dyn_cast<GpuDeallocOp>(op)) {
-      auto type = gpuDealloc.getMemref().getType();
-      std::optional<int64_t> maybeSize = getWorkgroupMemorySize(type);
-      if (maybeSize.has_value()) {
-        int64_t size = maybeSize.value();
-        LLVM_DEBUG(llvm::dbgs()
-                   << "Found rock.dealloc of " << size << " bytes\n");
-
-        if (memrefToAlloc.find(gpuDealloc.getMemref()) == memrefToAlloc.end()) {
-          LLVM_DEBUG(llvm::dbgs() << "Called rock.dealloc multiple times?\n");
-          return WalkResult::interrupt();
-        }
-        bool erased =
-            currentAllocs.remove(memrefToAlloc[gpuDealloc.getMemref()]);
-        deallocsUpToNow.insert(memrefToAlloc[gpuDealloc.getMemref()]);
-        if (!erased) {
-          LLVM_DEBUG(llvm::dbgs() << "Called rock.dealloc multiple times?\n");
-          return WalkResult::interrupt();
-        }
-        deallocs.push_back(gpuDealloc);
-      }
-    }
-    return WalkResult::advance();
-  });
-
-  if (walkResult.wasInterrupted()) {
-    LLVM_DEBUG(llvm::dbgs() << "Walk interrupted\n");
+  FailureOr<LDSInfo> maybeLdsInfo = createInterferenceGraph(func);
+  if (failed(maybeLdsInfo)) {
     return failure();
   }
+  LDSInfo ldsInfo = maybeLdsInfo.value();
 
-  // same number of rock.alloc and rock.dealloc
-  if (deallocs.size() != allocs.size() ||
-      allocs.size() != interferenceGraph.size() || !currentAllocs.empty()) {
-    LLVM_DEBUG(llvm::dbgs() << "There should be an equal number of rock.alloc "
-                               "and rock.dealloc (for LDS)\n");
-    return failure();
+  // add debug information
+  for (GpuAllocOp alloc : ldsInfo.allocs) {
+    auto type = alloc.getOutput().getType();
+    std::optional<int64_t> maybeSize = getWorkgroupMemorySize(type);
+    assert(maybeSize.has_value());
+    int64_t size = maybeSize.value();
+    LLVM_DEBUG(llvm::dbgs() << "Found rock.alloc of " << size << " bytes\n");
+  }
+  for (GpuDeallocOp dealloc : ldsInfo.deallocs) {
+    auto type = dealloc.getMemref().getType();
+    std::optional<int64_t> maybeSize = getWorkgroupMemorySize(type);
+    assert(maybeSize.has_value());
+    int64_t size = maybeSize.value();
+    LLVM_DEBUG(llvm::dbgs() << "Found rock.dealloc of " << size << " bytes\n");
   }
 
   // nothing to do if there is only one (or none) LDS allocation
-  if (interferenceGraph.size() < 2) {
+  if (ldsInfo.interferenceGraph.size() < 2) {
     LLVM_DEBUG(llvm::dbgs() << "Not enough LDS allocations, skipping pass\n");
     return success();
   }
 
   llvm::MapVector<int64_t, int64_t> colorSizes;
   SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>> allocOffsets;
-  std::tie(colorSizes, allocOffsets) =
-      graphColoring(allocs, interferenceGraph, deallocBefore);
-
-  int64_t requiredMemory = 0;
-  for (auto [_, size] : colorSizes) {
-    requiredMemory += size;
-  }
-
-  // not enough LDS memory
-  if (failed(checkLDSSize(func, requiredMemory))) {
-    LLVM_DEBUG(llvm::dbgs() << "ReuseLDS requires too much LDS memory: "
-                            << requiredMemory << " bytes\n");
-    return failure();
-  }
+  std::tie(colorSizes, allocOffsets) = graphColoring(ldsInfo);
 
   // write the new GpuAllocOp to the start
+  IRRewriter rewriter(func->getContext());
   rewriter.setInsertionPointToStart(&func.getBody().front());
 
   // New allocations
@@ -407,9 +153,9 @@ static LogicalResult reuseLDS(func::FuncOp &func) {
 
   // Remove all GpuDeallocOps but the last one and add a new alloc/dealloc pair
   // for each buffer
-  for (auto [i, dealloc] : llvm::enumerate(deallocs)) {
+  for (auto [i, dealloc] : llvm::enumerate(ldsInfo.deallocs)) {
     rewriter.setInsertionPointAfter(dealloc);
-    if (i == deallocs.size() - 1) {
+    if (i == ldsInfo.deallocs.size() - 1) {
       GpuDeallocOp prevDealloc = rewriter.replaceOpWithNewOp<GpuDeallocOp>(
           dealloc, std::get<1>(colorAllocs.front()));
       for (auto [i, pair] : llvm::enumerate(colorAllocs)) {
diff --git a/mlir/lib/Dialect/Rock/utility/CMakeLists.txt b/mlir/lib/Dialect/Rock/utility/CMakeLists.txt
index f0b834c71abb..423822d309ea 100644
--- a/mlir/lib/Dialect/Rock/utility/CMakeLists.txt
+++ b/mlir/lib/Dialect/Rock/utility/CMakeLists.txt
@@ -5,6 +5,7 @@ add_rocmlir_dialect_library(MLIRRockUtility
   loweringUtils.cpp
   transformMapUtils.cpp
   fusionUtils.cpp
+  memoryUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Rock
diff --git a/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp b/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp
new file mode 100644
index 000000000000..f577c4fba3fa
--- /dev/null
+++ b/mlir/lib/Dialect/Rock/utility/memoryUtils.cpp
@@ -0,0 +1,283 @@
+//===- memoryUtils.cpp - Rock memory utility functions --------------------===//
+// LDS helpers shared by Rock passes: sizes, interference graph, coloring.
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------===//
+
+#include "mlir/Dialect/Rock/utility/memoryUtils.h"
+
+#include "mlir/Dialect/Rock/utility/AmdArchDb.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Rock/utility/builderUtils.h"
+#include "mlir/Dialect/Rock/utility/loweringUtils.h"
+
+namespace mlir {
+namespace rock {
+
+std::optional<int64_t> getWorkgroupMemorySize(MemRefType type) {
+  // dyn_cast_or_null yields a null attribute when the memory space is missing
+  // or is not a gpu::AddressSpaceAttr; check it before calling getValue().
+  auto space = dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace());
+  if (!space || space.getValue() != gpu::GPUDialect::getWorkgroupAddressSpace())
+    return std::nullopt;
+  return type.getNumElements() * getByteWidth(type.getElementType());
+}
+
+static void assignColors(
+    GpuAllocOp alloc, llvm::SetVector<int64_t> &usedColors,
+    llvm::MapVector<int64_t, int64_t> &colorMemSize,
+    llvm::SmallDenseMap<GpuAllocOp, SetVector<int64_t>> &colorAssignment) {
+  // Gives `alloc` a run of consecutive colors, none of them in `usedColors`,
+  // whose sizes add up to at least the size of the alloc. Colors that do not
+  // exist yet are created with the still-missing size, rounded up to 16 bytes.
+  const std::optional<int64_t> maybeBytes =
+      getWorkgroupMemorySize(alloc.getOutput().getType());
+  assert(maybeBytes.has_value());
+  const int64_t bytesNeeded = maybeBytes.value();
+  assert(bytesNeeded > 0);
+
+  int64_t bytesCovered = 0;
+  for (int64_t color = 0; bytesCovered < bytesNeeded; ++color) {
+    SetVector<int64_t> &assigned = colorAssignment[alloc];
+    if (usedColors.contains(color)) {
+      // The run hit a used color: drop what was collected so far and restart
+      // from the next color that is free.
+      bytesCovered = 0;
+      assigned.clear();
+      while (usedColors.contains(color)) {
+        ++color;
+      }
+    }
+    assigned.insert(color);
+    // A color seen for the first time gets its size here (128-bit aligned).
+    if (!colorMemSize.contains(color)) {
+      colorMemSize[color] = llvm::alignTo(bytesNeeded - bytesCovered, 16);
+    }
+    bytesCovered += colorMemSize[color];
+  }
+}
+
+LogicalResult checkLDSSize(Operation *op, int64_t ldsBytes) {
+  // No known architecture means no limit to compare against: accept.
+  FailureOr<StringAttr> archOrFailure = getArch(op);
+  if (failed(archOrFailure))
+    return success();
+  // Otherwise the request must not exceed the arch's maxSharedMemPerWG.
+  const int64_t ldsLimit =
+      rock::lookupArchInfo(*archOrFailure).maxSharedMemPerWG;
+  return success(ldsBytes <= ldsLimit);
+}
+
+std::tuple<llvm::MapVector<int64_t, int64_t>,
+           SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>>>
+graphColoring(LDSInfo &ldsInfo) {
+  llvm::SmallDenseMap<GpuAllocOp, SetVector<int64_t>> colorAssignment;
+  llvm::MapVector<int64_t, int64_t> colorMemSize; // color -> size in bytes
+
+  SmallVector<GpuAllocOp> sortedAllocs(ldsInfo.allocs);
+
+  // Sort by alloc size (ascending): smaller allocs are colored first
+  llvm::sort(sortedAllocs, [](GpuAllocOp &a, GpuAllocOp &b) {
+    auto aSize = getWorkgroupMemorySize(a.getOutput().getType());
+    auto bSize = getWorkgroupMemorySize(b.getOutput().getType());
+    assert(aSize.has_value() && bSize.has_value());
+    return aSize.value() < bSize.value();
+  });
+  // Assign colors using greedy algorithm: skip colors of colored neighbors
+  for (GpuAllocOp alloc : sortedAllocs) {
+    llvm::SetVector<int64_t> usedColors;
+    for (GpuAllocOp neighbor : ldsInfo.interferenceGraph[alloc]) {
+      if (colorAssignment.contains(neighbor)) {
+        for (int64_t color : colorAssignment[neighbor]) {
+          usedColors.insert(color);
+        }
+      }
+    }
+
+    // Assign a set of consecutive colors, none of them in usedColors
+    assignColors(alloc, usedColors, colorMemSize, colorAssignment);
+  }
+
+  // If we replace all GpuAllocOps with a single one, we run into
+  // aliasing issues that cause performance regressions.
+  // In order to avoid that, we first merge colors so that each
+  // GpuAlloc is assigned a single color and an offset.
+  // Then, we can generate more than one GpuAllocOp and improve
+  // aliasing issues.
+  // This might be removed in the future if aliasing issues are solved.
+  llvm::SmallDenseMap<int64_t, SetVector<int64_t>> colorsToMerge;
+  llvm::SmallDenseMap<int64_t, int64_t> oldColorToNew;
+  for (GpuAllocOp alloc : sortedAllocs) {
+    int64_t firstColor = colorAssignment[alloc][0];
+    // if the color has been replaced already
+    if (oldColorToNew.contains(firstColor)) {
+      int64_t newColor = oldColorToNew[firstColor];
+      // assign all the colors of the current 'alloc' to newColor
+      for (int64_t color : colorAssignment[alloc]) {
+        oldColorToNew[color] = newColor;
+        colorsToMerge[newColor].insert(color);
+      }
+    } else {
+      // the color has not been replaced yet
+      for (auto [i, color] : llvm::enumerate(colorAssignment[alloc])) {
+        // merge all non-first colors with the first one
+        if (i > 0) {
+          oldColorToNew[color] = firstColor;
+          colorsToMerge[firstColor].insert(color);
+          // if the current 'color' has merged some colors,
+          // merge its colors to 'firstColor'
+          if (colorsToMerge.contains(color)) {
+            for (int64_t otherColor : colorsToMerge[color]) {
+              oldColorToNew[otherColor] = firstColor;
+              colorsToMerge[firstColor].insert(otherColor);
+            }
+            colorsToMerge.erase(color);
+          }
+        }
+      }
+    }
+  }
+  // Compute offsets and new sizes (after merging)
+  llvm::MapVector<int64_t, int64_t> mergedColorMemSize(colorMemSize);
+  llvm::MapVector<int64_t, int64_t> colorOffset; // byte offset in merged color
+  for (auto [color, _] : colorMemSize) {
+    colorOffset[color] = 0;
+  }
+  for (auto [color, oldColors] : colorsToMerge) {
+    for (int64_t oldColor : oldColors) {
+      assert(oldColor > color);
+      colorOffset[oldColor] = mergedColorMemSize[color];
+      mergedColorMemSize[color] += mergedColorMemSize[oldColor];
+      mergedColorMemSize.erase(oldColor);
+    }
+  }
+
+  // Per GpuAllocOp (in original order): merged color, offset, LDS barrier flag
+  SmallVector<std::tuple<GpuAllocOp, int64_t, int64_t, bool>> gpuAllocInfo;
+  llvm::SetVector<int64_t> usedColors;
+  for (const GpuAllocOp alloc : ldsInfo.allocs) {
+    assert(colorAssignment.contains(alloc));
+
+    // if the color has been used, we are "reusing" memory,
+    // we need a LDS barrier
+    bool useLDSBarrier = false;
+    for (int64_t color : colorAssignment[alloc]) {
+      useLDSBarrier |= usedColors.contains(color);
+      usedColors.insert(color);
+    }
+
+    if (useLDSBarrier) {
+      for (GpuAllocOp deadAlloc : ldsInfo.deallocBefore[alloc]) {
+        for (int64_t color : colorAssignment[deadAlloc]) {
+          if (!colorAssignment[alloc].contains(color)) {
+            usedColors.remove(color);
+          }
+        }
+      }
+    }
+
+    int64_t oldColor = colorAssignment[alloc][0];
+    assert(colorOffset.contains(oldColor));
+    int64_t offset = colorOffset[oldColor];
+    int64_t newColor =
+        (oldColorToNew.contains(oldColor)) ? oldColorToNew[oldColor] : oldColor;
+    gpuAllocInfo.push_back(std::tuple(alloc, newColor, offset, useLDSBarrier));
+  }
+
+  return std::tuple(mergedColorMemSize, gpuAllocInfo);
+}
+
+FailureOr<LDSInfo> createInterferenceGraph(func::FuncOp &func) {
+  // Walks `func` once and records every rock.alloc / rock.dealloc of workgroup
+  // memory (LDS). Two allocs interfere when one is created while the other is
+  // still live, i.e. not deallocated yet. Returns failure, after emitting a
+  // diagnostic on the offending op, when allocs and deallocs are not paired.
+  LDSInfo ldsInfo;
+  SetVector<GpuAllocOp> currentAllocs;
+  llvm::SmallDenseMap<Value, GpuAllocOp> memrefToAlloc;
+  llvm::SetVector<GpuAllocOp> deallocsUpToNow;
+
+  // Create the interference graph and save all allocs and deallocs (LDS)
+  WalkResult walkResult = func.walk([&](Operation *op) -> WalkResult {
+    if (auto gpuAlloc = dyn_cast<GpuAllocOp>(op)) {
+      // allocs outside workgroup memory are not tracked
+      if (!getWorkgroupMemorySize(gpuAlloc.getOutput().getType()))
+        return WalkResult::advance();
+
+      // save deallocs up to this point
+      ldsInfo.deallocBefore[gpuAlloc] = SetVector<GpuAllocOp>(
+          deallocsUpToNow.begin(), deallocsUpToNow.end());
+      deallocsUpToNow.clear();
+
+      // add vertex and connections
+      for (auto alloc : currentAllocs) {
+        ldsInfo.interferenceGraph[alloc].insert(gpuAlloc);
+        ldsInfo.interferenceGraph[gpuAlloc].insert(alloc);
+      }
+      // if it has no neighbors, we still want to add a vertex
+      if (currentAllocs.empty()) {
+        ldsInfo.interferenceGraph[gpuAlloc] = {};
+      }
+      currentAllocs.insert(gpuAlloc);
+      memrefToAlloc[gpuAlloc.getOutput()] = gpuAlloc;
+      ldsInfo.allocs.push_back(gpuAlloc);
+    } else if (auto gpuDealloc = dyn_cast<GpuDeallocOp>(op)) {
+      if (!getWorkgroupMemorySize(gpuDealloc.getMemref().getType()))
+        return WalkResult::advance();
+
+      // report at the offending dealloc, not at an unrelated alloc
+      auto allocIt = memrefToAlloc.find(gpuDealloc.getMemref());
+      if (allocIt == memrefToAlloc.end()) {
+        gpuDealloc.emitError("rock.dealloc without a preceding rock.alloc");
+        return WalkResult::interrupt();
+      }
+      if (!currentAllocs.remove(allocIt->second)) {
+        gpuDealloc.emitError("Called rock.dealloc multiple times");
+        return WalkResult::interrupt();
+      }
+      deallocsUpToNow.insert(allocIt->second);
+      ldsInfo.deallocs.push_back(gpuDealloc);
+    }
+    return WalkResult::advance();
+  });
+
+  // the diagnostic was already emitted inside the walk
+  if (walkResult.wasInterrupted())
+    return failure();
+
+  // same number of rock.alloc and rock.dealloc
+  if (ldsInfo.deallocs.size() != ldsInfo.allocs.size() ||
+      ldsInfo.allocs.size() != ldsInfo.interferenceGraph.size() ||
+      !currentAllocs.empty()) {
+    return func.emitError("There should be an equal number of rock.alloc and "
+                          "rock.dealloc (for LDS)");
+  }
+
+  return ldsInfo;
+}
+
+FailureOr<int64_t> getAllocatedLDSAfterReuse(func::FuncOp &func) {
+  // Builds the LDS interference graph of `func`, colors it, and returns the
+  // sum of the merged color sizes; fails if the graph cannot be built.
+  FailureOr<LDSInfo> ldsInfoOrFailure = createInterferenceGraph(func);
+  if (failed(ldsInfoOrFailure))
+    return failure();
+
+  // only the per-color sizes matter here, the per-alloc data is dropped
+  llvm::MapVector<int64_t, int64_t> sizePerColor =
+      std::get<0>(graphColoring(*ldsInfoOrFailure));
+
+  int64_t totalBytes = 0;
+  for (const auto &colorAndSize : sizePerColor) {
+    totalBytes += colorAndSize.second;
+  }
+
+  return totalBytes;
+}
+
+} // namespace rock
+} // namespace mlir
\ No newline at end of file