forked from oneapi-src/unified-runtime
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request oneapi-src#1958 from igchor/kernel_helpers
[L0] move kernel helper functions to a separate file
- Loading branch information
Showing
6 changed files
with
257 additions
and
198 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
//===--------- kernel_helpers.cpp - Level Zero Adapter -------------------===// | ||
// | ||
// Copyright (C) 2024 Intel Corporation | ||
// | ||
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM | ||
// Exceptions. See LICENSE.TXT | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "kernel_helpers.hpp" | ||
#include "logger/ur_logger.hpp" | ||
|
||
#include "../common.hpp" | ||
#include "../context.hpp" | ||
#include "../device.hpp" | ||
|
||
// Chooses a local work-group size for launching hZeKernel over
// GlobalWorkSize3D and stores it in SuggestedLocalWorkSize3D.
//
// When all three global sizes fit in 32 bits the choice is delegated to
// zeKernelSuggestGroupSize. Otherwise every dimension independently receives
// the largest divisor of its global size that does not exceed the device's
// maxGroupSizeX/Y/Z.
//
// Returns UR_RESULT_SUCCESS on success, or
// UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE when the fallback cannot bring the
// group count of a dimension down to UINT32_MAX or less. A failing
// zeKernelSuggestGroupSize is presumably reported through ZE2UR_CALL's early
// return (macro defined elsewhere).
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice,
                                      ze_kernel_handle_t hZeKernel,
                                      size_t GlobalWorkSize3D[3],
                                      uint32_t SuggestedLocalWorkSize3D[3]) {
  uint32_t *WG = SuggestedLocalWorkSize3D;

  // zeKernelSuggestGroupSize can only be used when every 64-bit global size
  // fits into the 32-bit values the API currently accepts.
  bool SuggestGroupSize = true;
  for (int I = 0; I < 3; ++I) {
    if (GlobalWorkSize3D[I] > UINT32_MAX) {
      SuggestGroupSize = false;
    }
  }

  if (SuggestGroupSize) {
    ZE2UR_CALL(zeKernelSuggestGroupSize,
               (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
                GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
    return UR_RESULT_SUCCESS;
  }

  // The device limits do not change between dimensions, read them once.
  const uint32_t MaxGroupSize[3] = {
      hDevice->ZeDeviceComputeProperties->maxGroupSizeX,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeY,
      hDevice->ZeDeviceComputeProperties->maxGroupSizeZ};

  for (int I = 0; I < 3; ++I) {
    // Try to find an I-dimension WG size that GlobalWorkSize3D[I] is fully
    // divisible by. Start with the max possible size in the dimension.
    size_t GroupSize =
        (std::min)(size_t(MaxGroupSize[I]), GlobalWorkSize3D[I]);
    // A zero global size (or a zero device limit) would make the candidate
    // zero and turn the modulo/division below into a division by zero.
    // Clamping to 1 also guarantees the search terminates, since every value
    // is divisible by 1.
    GroupSize = (std::max)(GroupSize, size_t(1));
    while (GlobalWorkSize3D[I] % GroupSize) {
      --GroupSize;
    }
    if (GlobalWorkSize3D[I] / GroupSize > UINT32_MAX) {
      logger::error("getSuggestedLocalWorkSize: can't find a WG size "
                    "suitable for global work size > UINT32_MAX");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
    // GroupSize never exceeds MaxGroupSize[I] (or 1), so it fits in 32 bits.
    WG[I] = static_cast<uint32_t>(GroupSize);
  }
  logger::debug(
      "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}",
      WG[0], WG[1], WG[2]);

  return UR_RESULT_SUCCESS;
}
|
||
// Applies the three values of GlobalWorkOffset to Kernel through
// zeKernelSetGlobalOffsetExp. When the platform of Context does not report
// the global offset extension, the kernel is left untouched and
// UR_RESULT_ERROR_INVALID_VALUE is returned.
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context,
                                  ze_kernel_handle_t Kernel,
                                  const size_t *GlobalWorkOffset) {
  if (Context->getPlatform()->ZeDriverGlobalOffsetExtensionFound) {
    // Extension available: forward the X, Y and Z offsets to the driver.
    ZE2UR_CALL(
        zeKernelSetGlobalOffsetExp,
        (Kernel, GlobalWorkOffset[0], GlobalWorkOffset[1], GlobalWorkOffset[2]));
    return UR_RESULT_SUCCESS;
  }

  logger::debug("No global offset extension found on this driver");
  return UR_RESULT_ERROR_INVALID_VALUE;
}
|
||
// Computes the work-group size (WG) and the number of work groups
// (ZeThreadGroupDimensions) for a kernel launch of WorkDim dimensions.
//
// LocalWorkSize, when given, is used as the work-group size for the first
// WorkDim dimensions; otherwise the size comes from
// getSuggestedLocalWorkSize(). Dimensions at or beyond WorkDim get a
// work-group size and a group count of 1, so all three fields of
// ZeThreadGroupDimensions are always written on success.
//
// Returns:
//   UR_RESULT_SUCCESS                       - WG and group counts are valid.
//   UR_RESULT_ERROR_INVALID_VALUE           - GlobalWorkSize is null, both
//                                             LocalWorkSize and Kernel are
//                                             null, or WorkDim is not 1..3.
//   UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - a work-group size is zero or a
//                                             global size is not a multiple
//                                             of its work-group size.
//   Any error returned by getSuggestedLocalWorkSize().
ur_result_t calculateKernelWorkDimensions(
    ze_kernel_handle_t Kernel, ur_device_handle_t Device,
    ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3],
    uint32_t WorkDim, const size_t *GlobalWorkSize,
    const size_t *LocalWorkSize) {

  UR_ASSERT(GlobalWorkSize, UR_RESULT_ERROR_INVALID_VALUE);
  // If LocalWorkSize is not provided then Kernel must be provided to query
  // suggested group size.
  UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE);

  // Validate WorkDim before it is used as a copy length: copying more than
  // three elements would overflow GlobalWorkSize3D on the stack.
  if (WorkDim < 1 || WorkDim > 3) {
    logger::error("calculateKernelWorkDimensions: unsupported work_dim");
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  // New variable needed because GlobalWorkSize parameter might not be of size
  // 3
  size_t GlobalWorkSize3D[3]{1, 1, 1};
  std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);

  if (LocalWorkSize) {
    for (uint32_t I = 0; I < 3; ++I) {
      WG[I] = I < WorkDim ? ur_cast<uint32_t>(LocalWorkSize[I]) : 1;
    }
  } else {
    UR_CALL(getSuggestedLocalWorkSize(Device, Kernel, GlobalWorkSize3D, WG));
    // Dimensions the launch does not use always have a group size of 1.
    for (uint32_t I = WorkDim; I < 3; ++I) {
      WG[I] = 1;
    }
  }

  // A zero work-group size can never tile the range and would cause a
  // division by zero below.
  for (uint32_t I = 0; I < 3; ++I) {
    if (WG[I] == 0) {
      logger::error("calculateKernelWorkDimensions: work group size must be "
                    "non-zero in every dimension");
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }
  }

  // TODO: assert if sizes do not fit into 32-bit?
  // Unused dimensions have a global size and group size of 1, so computing
  // all three counts yields 1 for them and leaves no field of the output
  // dependent on how the caller initialised it.
  ZeThreadGroupDimensions.groupCountX =
      ur_cast<uint32_t>(GlobalWorkSize3D[0] / WG[0]);
  ZeThreadGroupDimensions.groupCountY =
      ur_cast<uint32_t>(GlobalWorkSize3D[1] / WG[1]);
  ZeThreadGroupDimensions.groupCountZ =
      ur_cast<uint32_t>(GlobalWorkSize3D[2] / WG[2]);

  // Error handling for non-uniform group size case
  if (GlobalWorkSize3D[0] !=
      size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 1st dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[1] !=
      size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 2nd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }
  if (GlobalWorkSize3D[2] !=
      size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) {
    logger::error("calculateKernelWorkDimensions: invalid work_dim. The range "
                  "is not a multiple of the group size in the 3rd dimension");
    return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
  }

  return UR_RESULT_SUCCESS;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
//===--------- kernel_helpers.hpp - Level Zero Adapter -------------------===// | ||
// | ||
// Copyright (C) 2024 Intel Corporation | ||
// | ||
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM | ||
// Exceptions. See LICENSE.TXT | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include <ur_api.h> | ||
#include <ze_api.h> | ||
|
||
/** | ||
* Calculates a work group size for the kernel based on the GlobalWorkSize or | ||
* the LocalWorkSize if provided. | ||
* @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not | ||
* provided. | ||
* @param[in][optional] Device The device associated with the kernel. Used when | ||
* LocalWorkSize is not provided. | ||
* @param[out] ZeThreadGroupDimensions Number of work groups in each dimension. | ||
* @param[out] WG The work group size for each dimension. | ||
* @param[in] WorkDim The number of dimensions in the kernel. | ||
* @param[in] GlobalWorkSize The global work size. | ||
* @param[in][optional] LocalWorkSize The local work size. | ||
* @return UR_RESULT_SUCCESS or an error code on failure. | ||
*/ | ||
ur_result_t calculateKernelWorkDimensions( | ||
ze_kernel_handle_t Kernel, ur_device_handle_t Device, | ||
ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], | ||
uint32_t WorkDim, const size_t *GlobalWorkSize, | ||
const size_t *LocalWorkSize); | ||
|
||
/** | ||
* Sets the global offset for a kernel command that will be appended to the | ||
* command buffer. | ||
* @param[in] Context Context associated with the queue. | ||
* @param[in] Kernel The handle to the kernel that will be appended. | ||
* @param[in] GlobalWorkOffset The global offset value. | ||
* @return UR_RESULT_SUCCESS or an error code on failure | ||
*/ | ||
ur_result_t setKernelGlobalOffset(ur_context_handle_t Context, | ||
ze_kernel_handle_t Kernel, | ||
const size_t *GlobalWorkOffset); | ||
|
||
/** | ||
* Get the suggested local work size for a kernel. | ||
* @param[in] hDevice The device associated with the kernel. | ||
* @param[in] hZeKernel The kernel handle. | ||
* @param[in] GlobalWorkSize3D The global work size. | ||
* @param[out] SuggestedLocalWorkSize3D The suggested local work size. | ||
* @return UR_RESULT_SUCCESS or an error code on failure. | ||
*/ | ||
ur_result_t getSuggestedLocalWorkSize(ur_device_handle_t hDevice, | ||
ze_kernel_handle_t hZeKernel, | ||
size_t GlobalWorkSize3D[3], | ||
uint32_t SuggestedLocalWorkSize3D[3]); |
Oops, something went wrong.