Skip to content

Commit

Permalink
Merge pull request oneapi-src#1958 from igchor/kernel_helpers
Browse files Browse the repository at this point in the history
[L0] move kernel helper functions to a separate file
  • Loading branch information
igchor authored Aug 13, 2024
2 parents 78c003e + 5001a40 commit 6e8efa3
Show file tree
Hide file tree
Showing 6 changed files with 257 additions and 198 deletions.
2 changes: 2 additions & 0 deletions source/adapters/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/queue_api.hpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
Expand All @@ -130,6 +131,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/image.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
)

Expand Down
131 changes: 4 additions & 127 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
//
//===----------------------------------------------------------------------===//
#include "command_buffer.hpp"
#include "helpers/kernel_helpers.hpp"
#include "logger/ur_logger.hpp"
#include "ur_level_zero.hpp"

Expand Down Expand Up @@ -78,130 +79,6 @@ preferCopyEngineForFill(ur_exp_command_buffer_handle_t CommandBuffer,
return UR_RESULT_SUCCESS;
}

/**
 * Calculates a work group size for the kernel based on the GlobalWorkSize or
 * the LocalWorkSize if provided.
 * @param[in][optional] Kernel The Kernel. Must be non-null when LocalWorkSize
 * is not provided, because its ZeKernel is queried for a suggested group size.
 * @param[in][optional] Device The device associated with the kernel. Only
 * dereferenced when LocalWorkSize is not provided and a global size exceeds
 * UINT32_MAX.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * Dimensions at or beyond WorkDim are set to 1.
 * @param[out] WG The work group size for each dimension. Dimensions at or
 * beyond WorkDim are set to 1.
 * @param[in] WorkDim The number of dimensions in the kernel; must be 1, 2 or 3.
 * @param[in] GlobalWorkSize The global work size; WorkDim elements are read.
 * @param[in][optional] LocalWorkSize The local work size; WorkDim elements are
 * read when provided.
 * @return UR_RESULT_SUCCESS or an error code on failure:
 * UR_RESULT_ERROR_INVALID_VALUE for a missing argument or an unsupported
 * WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when a work group size is
 * zero or the global size is not a whole multiple of it.
 */
ur_result_t calculateKernelWorkDimensions(
    ur_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // WorkDim is used as a copy length below, so it has to be validated before
  // anything is copied into the fixed 3-element array.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
    WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
    WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
  } else {
    // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize3D
    // values do not fit to 32-bit that the API only supports currently.
    bool SuggestGroupSize = true;
    for (uint32_t I = 0; I < 3; ++I) {
      if (GlobalWorkSize3D[I] > UINT32_MAX) {
        SuggestGroupSize = false;
      }
    }
    if (SuggestGroupSize) {
      ZE2UR_CALL(zeKernelSuggestGroupSize,
                 (Kernel->ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    } else {
      // The device limits are only needed on this path, so this is the only
      // place where a missing Device is an error.
      UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_VALUE);
      const uint32_t MaxGroupSize[3] = {
          Device->ZeDeviceComputeProperties->maxGroupSizeX,
          Device->ZeDeviceComputeProperties->maxGroupSizeY,
          Device->ZeDeviceComputeProperties->maxGroupSizeZ};

      for (uint32_t I = 0; I < 3; ++I) {
        // Try to find a I-dimension WG size that the GlobalWorkSize3D[I] is
        // fully divisible with. Start with the max possible size in
        // each dimension.
        uint32_t GroupSize = static_cast<uint32_t>(
            (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]));
        // An empty dimension would make the candidate 0 and the modulo below
        // a division by zero; 1 divides every size, including 0.
        if (GroupSize == 0) {
          GroupSize = 1;
        }
        while (GlobalWorkSize3D[I] % GroupSize) {
          --GroupSize;
        }
        // Index the padded copy: the caller's array only has WorkDim entries.
        if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
          logger::debug("calculateKernelWorkDimensions: can't find a WG size "
                        "suitable for global work size > UINT32_MAX");
          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
        }
        WG[I] = GroupSize;
      }
      logger::debug("calculateKernelWorkDimensions: using computed WG "
                    "size = {{{}, {}, {}}}",
                    WG[0], WG[1], WG[2]);
    }
  }

  // A zero work group size would divide by zero when computing group counts.
  for (uint32_t I = 0; I < WorkDim; ++I) {
    if (WG[I] == 0) {
      logger::error(
          "calculateKernelWorkDimensions: work group size must be non-zero");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // Dimensions the kernel does not use get exactly one group of size one, so
  // the result does not depend on what the caller left in the out-parameters.
  uint32_t GroupCount[3]{1, 1, 1};
  for (uint32_t I = 0; I < 3; ++I) {
    if (I < WorkDim) {
      GroupCount[I] = ur_cast<uint32_t>(GlobalWorkSize3D[I] / WG[I]);
    } else {
      WG[I] = 1;
    }
  }
  ZeThreadGroupDimensions.groupCountX = GroupCount[0];
  ZeThreadGroupDimensions.groupCountY = GroupCount[1];
  ZeThreadGroupDimensions.groupCountZ = GroupCount[2];

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}

/**
* Helper function for finding the Level Zero events associated with the
* commands in a command-buffer, each event is pointed to by a sync-point in the
Expand Down Expand Up @@ -880,7 +757,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(

ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
uint32_t WG[3];
UR_CALL(calculateKernelWorkDimensions(Kernel, CommandBuffer->Device,
UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, CommandBuffer->Device,
ZeThreadGroupDimensions, WG, WorkDim,
GlobalWorkSize, LocalWorkSize));

Expand Down Expand Up @@ -1587,8 +1464,8 @@ ur_result_t updateKernelCommand(

uint32_t WG[3];
UR_CALL(calculateKernelWorkDimensions(
Command->Kernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG,
Dim, NewGlobalWorkSize, NewLocalWorkSize));
Command->Kernel->ZeKernel, CommandBuffer->Device,
ZeThreadGroupDimensions, WG, Dim, NewGlobalWorkSize, NewLocalWorkSize));

auto MutableGroupCountDesc =
std::make_unique<ZeStruct<ze_mutable_group_count_exp_desc_t>>();
Expand Down
152 changes: 152 additions & 0 deletions source/adapters/level_zero/helpers/kernel_helpers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
//===--------- kernel_helpers.cpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kernel_helpers.hpp"
#include "logger/ur_logger.hpp"

#include "../common.hpp"
#include "../context.hpp"
#include "../device.hpp"

/**
 * Chooses a local work size for a kernel launch.
 *
 * When every global size fits in 32 bits the choice is delegated to
 * zeKernelSuggestGroupSize. Otherwise, for each dimension, the largest value
 * not exceeding the device's maximum group size that evenly divides the
 * global size is used.
 *
 * @param[in] hDevice Device whose maximum group sizes bound the fallback
 * search. Only dereferenced on the fallback path.
 * @param[in] hZeKernel Level Zero kernel handle passed to
 * zeKernelSuggestGroupSize.
 * @param[in] GlobalWorkSize3D Global work size; three elements are read.
 * @param[out] SuggestedLocalWorkSize3D Receives the three chosen sizes.
 * @return UR_RESULT_SUCCESS or an error code on failure;
 * UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when the resulting group count
 * would not fit in 32 bits.
 */
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
                                      ze_kernel_handle_t hZeKernel,
                                      size_t GlobalWorkSize3D[3],
                                      uint32_t SuggestedLocalWorkSize3D[3]) {
  uint32_t *WG = SuggestedLocalWorkSize3D;

  // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
  // values do not fit to 32-bit that the API only supports currently.
  bool SuggestGroupSize = true;
  for (uint32_t I = 0; I < 3; ++I) {
    if (GlobalWorkSize3D[I] > UINT32_MAX) {
      SuggestGroupSize = false;
    }
  }

  if (SuggestGroupSize) {
    ZE2UR_CALL(zeKernelSuggestGroupSize,
               (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    return UR_RESULT_SUCCESS;
  }

  // The device limits are only needed from here on, so this is the only path
  // on which a missing device is an error.
  UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_VALUE);
  const uint32_t MaxGroupSize[3] = {
      hDevice->ZeDeviceComputeProperties->maxGroupSizeX,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeY,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeZ};

  for (uint32_t I = 0; I < 3; ++I) {
    // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
    // fully divisible with. Start with the max possible size in
    // each dimension.
    uint32_t GroupSize = static_cast<uint32_t>(
        (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]));
    // An empty dimension would make the candidate 0 and the modulo below a
    // division by zero; 1 divides every size, including 0.
    if (GroupSize == 0) {
      GroupSize = 1;
    }
    while (GlobalWorkSize3D[I] % GroupSize) {
      --GroupSize;
    }
    if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
      logger::error("getSuggestedLocalWorkSize: can't find a WG size "
                    "suitable for global work size > UINT32_MAX");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
    WG[I] = GroupSize;
  }
  logger::debug(
      "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}",
      WG[0], WG[1], WG[2]);

  return UR_RESULT_SUCCESS;
}

/**
 * Applies a global work offset to a Level Zero kernel.
 *
 * The call is only attempted when the platform owning Context reports the
 * global offset driver extension; otherwise UR_RESULT_ERROR_INVALID_VALUE is
 * returned. The first three elements of GlobalWorkOffset are forwarded to
 * zeKernelSetGlobalOffsetExp.
 */
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
                                  ze_kernel_handle_t Kernel,
                                  const size_t *GlobalWorkOffset) {
  const bool HasGlobalOffsetExtension =
      Context->getPlatform()->ZeDriverGlobalOffsetExtensionFound;

  if (HasGlobalOffsetExtension) {
    ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
               (Kernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
                GlobalWorkOffset[2]));
    return UR_RESULT_SUCCESS;
  }

  // Without the extension there is no way to honour the requested offset.
  logger::debug("No global offset extension found on this driver");
  return UR_RESULT_ERROR_INVALID_VALUE;
}

/**
 * Computes the work group size and the number of work groups for a kernel
 * launch.
 *
 * The work group size is taken from LocalWorkSize when it is provided and
 * from getSuggestedLocalWorkSize otherwise. Dimensions at or beyond WorkDim
 * always end up with a work group size of 1 and a group count of 1.
 *
 * @param[in][optional] Kernel Level Zero kernel handle. Must be non-null when
 * LocalWorkSize is not provided.
 * @param[in][optional] Device Device forwarded to getSuggestedLocalWorkSize
 * when LocalWorkSize is not provided.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * @param[out] WG The work group size for each dimension.
 * @param[in] WorkDim The number of dimensions in the kernel; must be 1, 2 or 3.
 * @param[in] GlobalWorkSize The global work size; WorkDim elements are read.
 * @param[in][optional] LocalWorkSize The local work size; WorkDim elements are
 * read when provided.
 * @return UR_RESULT_SUCCESS or an error code on failure:
 * UR_RESULT_ERROR_INVALID_VALUE for a missing argument or an unsupported
 * WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when a work group size is
 * zero or the global size is not a whole multiple of it.
 */
ur_result_t calculateKernelWorkDimensions(
    ze_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // WorkDim is used as a copy length below, so it has to be validated before
  // anything is copied into the fixed 3-element array.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    WG[0] = ur_cast<uint32_t>(LocalWorkSize[0]);
    WG[1] = WorkDim >= 2 ? ur_cast<uint32_t>(LocalWorkSize[1]) : 1;
    WG[2] = WorkDim == 3 ? ur_cast<uint32_t>(LocalWorkSize[2]) : 1;
  } else {
    UR_CALL(getSuggestedLocalWorkSize(Device, Kernel, GlobalWorkSize3D, WG));
  }

  // A zero work group size would divide by zero when computing group counts.
  for (uint32_t I = 0; I < WorkDim; ++I) {
    if (WG[I] == 0) {
      logger::error(
          "calculateKernelWorkDimensions: work group size must be non-zero");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // Dimensions the kernel does not use get exactly one group of size one, so
  // the result does not depend on what the caller left in the out-parameters.
  uint32_t GroupCount[3]{1, 1, 1};
  for (uint32_t I = 0; I < 3; ++I) {
    if (I < WorkDim) {
      GroupCount[I] = ur_cast<uint32_t>(GlobalWorkSize3D[I] / WG[I]);
    } else {
      WG[I] = 1;
    }
  }
  ZeThreadGroupDimensions.groupCountX = GroupCount[0];
  ZeThreadGroupDimensions.groupCountY = GroupCount[1];
  ZeThreadGroupDimensions.groupCountZ = GroupCount[2];

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}
57 changes: 57 additions & 0 deletions source/adapters/level_zero/helpers/kernel_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
//===--------- kernel_helpers.hpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <ur_api.h>
#include <ze_api.h>

/**
 * Calculates a work group size for the kernel based on the GlobalWorkSize or
 * the LocalWorkSize if provided, together with the resulting number of work
 * groups in each dimension.
 * @param[in][optional] Kernel The Level Zero kernel handle. Must be non-null
 * when LocalWorkSize is not provided, since it is then used to query a
 * suggested group size.
 * @param[in][optional] Device The device associated with the kernel. Used when
 * LocalWorkSize is not provided.
 * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension.
 * Callers should pass it initialized to {1, 1, 1}: the counts for dimensions
 * beyond WorkDim are expected to be 1.
 * @param[out] WG The work group size for each dimension. Dimensions beyond
 * WorkDim are reported as 1.
 * @param[in] WorkDim The number of dimensions in the kernel. Values other than
 * 1, 2 or 3 are rejected with UR_RESULT_ERROR_INVALID_VALUE.
 * @param[in] GlobalWorkSize The global work size. Must be non-null; WorkDim
 * elements are read.
 * @param[in][optional] LocalWorkSize The local work size. When provided,
 * WorkDim elements are read.
 * @return UR_RESULT_SUCCESS or an error code on failure;
 * UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when the global size is not a whole
 * multiple of the work group size in some dimension.
 */
ur_result_t calculateKernelWorkDimensions(
ze_kernel_handle_t Kernel, ur_device_handle_t Device,
ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
uint32_t WorkDim, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize);

/**
 * Sets the global offset for a kernel.
 * @param[in] Context Context whose platform is checked for the global offset
 * driver extension.
 * @param[in] Kernel The Level Zero handle of the kernel the offset is set on.
 * @param[in] GlobalWorkOffset The global offset value. Three elements (x, y,
 * z) are read.
 * @return UR_RESULT_SUCCESS or an error code on failure;
 * UR_RESULT_ERROR_INVALID_VALUE when the driver does not provide the global
 * offset extension.
 */
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
ze_kernel_handle_t Kernel,
const size_t *GlobalWorkOffset);

/**
 * Get the suggested local work size for a kernel.
 *
 * When all three global sizes fit in 32 bits the suggestion comes from
 * zeKernelSuggestGroupSize. Otherwise each dimension gets the largest size,
 * capped by the device's maximum group size, that evenly divides the global
 * size.
 * @param[in] hDevice The device associated with the kernel. Its maximum group
 * sizes are read only when a global size exceeds UINT32_MAX.
 * @param[in] hZeKernel The kernel handle.
 * @param[in] GlobalWorkSize3D The global work size. Three elements are read.
 * @param[out] SuggestedLocalWorkSize3D The suggested local work size. Three
 * elements are written on success.
 * @return UR_RESULT_SUCCESS or an error code on failure;
 * UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when no size can be found that keeps
 * the group count within UINT32_MAX.
 */
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
ze_kernel_handle_t hZeKernel,
size_t GlobalWorkSize3D[3],
uint32_t SuggestedLocalWorkSize3D[3]);
Loading

0 comments on commit 6e8efa3

Please sign in to comment.