Skip to content

Commit

Permalink
Add a libcudf/thrust-based TPC-H derived datagen (#16294)
Browse files Browse the repository at this point in the history
This PR adds a TPC-H (according to spec 3.0.1) inspired datagen written using `libcudf` and `thrust`

### Implementation Status

- [x] lineitem
- [x] orders
- [x] region 
- [x] nation
- [x] supplier 
- [x] customer
- [x] part
- [x] partsupp

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #16294
  • Loading branch information
JayjeetAtGithub authored Aug 30, 2024
1 parent 62a53b3 commit 23fb31e
Show file tree
Hide file tree
Showing 7 changed files with 2,042 additions and 0 deletions.
24 changes: 24 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,30 @@ target_include_directories(
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)

add_library(
tpch_data_generator STATIC
common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
common/tpch_data_generator/random_column_generator.cu
)
target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
)

target_link_libraries(
tpch_data_generator
PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

target_include_directories(
tpch_data_generator
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)

# ##################################################################################################
# * compiler function -----------------------------------------------------------------------------

Expand Down
246 changes: 246 additions & 0 deletions cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "random_column_generator.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/detail/strings_children.cuh>

#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>

#include <string>

namespace cudf::datagen {

namespace {

// Functor for generating random strings
struct random_string_generator {
char* chars;
thrust::default_random_engine engine;
thrust::uniform_int_distribution<unsigned char> char_dist;

CUDF_HOST_DEVICE random_string_generator(char* c) : chars(c), char_dist(44, 122) {}

__device__ void operator()(thrust::tuple<int64_t, int64_t> str_begin_end)
{
auto begin = thrust::get<0>(str_begin_end);
auto end = thrust::get<1>(str_begin_end);
engine.discard(begin);
for (auto i = begin; i < end; ++i) {
auto ch = char_dist(engine);
if (i == end - 1 && ch >= '\x7F') ch = ' '; // last element ASCII only.
if (ch >= '\x7F') // x7F is at the top edge of ASCII
chars[i++] = '\xC4'; // these characters are assigned two bytes
chars[i] = static_cast<char>(ch + (ch >= '\x7F'));
}
}
};

// Functor for generating random numbers
template <typename T>
struct random_number_generator {
T lower;
T upper;

CUDF_HOST_DEVICE random_number_generator(T lower, T upper) : lower(lower), upper(upper) {}

__device__ T operator()(const int64_t idx) const
{
if constexpr (cudf::is_integral<T>()) {
thrust::default_random_engine engine;
thrust::uniform_int_distribution<T> dist(lower, upper);
engine.discard(idx);
return dist(engine);
} else {
thrust::default_random_engine engine;
thrust::uniform_real_distribution<T> dist(lower, upper);
engine.discard(idx);
return dist(engine);
}
}
};

} // namespace

std::unique_ptr<cudf::column> generate_random_string_column(cudf::size_type lower,
cudf::size_type upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto offsets_begin = cudf::detail::make_counting_transform_iterator(
0, random_number_generator<cudf::size_type>(lower, upper));
auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_begin, offsets_begin + num_rows, stream, mr);
rmm::device_uvector<char> chars(computed_bytes, stream);

auto const offset_itr =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// We generate the strings in parallel into the `chars` vector using the
// offsets vector generated above.
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(offset_itr, offset_itr + 1),
num_rows,
random_string_generator(chars.data()));

return cudf::make_strings_column(
num_rows, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
}

template <typename T>
std::unique_ptr<cudf::column> generate_random_numeric_column(T lower,
T upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto col = cudf::make_numeric_column(
cudf::data_type{cudf::type_to_id<T>()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr);
cudf::size_type begin = 0;
cudf::size_type end = num_rows;
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator(begin),
thrust::make_counting_iterator(end),
col->mutable_view().begin<T>(),
random_number_generator<T>(lower, upper));
return col;
}

template std::unique_ptr<cudf::column> generate_random_numeric_column<int8_t>(
int8_t lower,
int8_t upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<int16_t>(
int16_t lower,
int16_t upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<cudf::size_type>(
cudf::size_type lower,
cudf::size_type upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_random_numeric_column<double>(
double lower,
double upper,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

std::unique_ptr<cudf::column> generate_primary_key_column(cudf::scalar const& start,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return cudf::sequence(num_rows, start, stream, mr);
}

std::unique_ptr<cudf::column> generate_repeat_string_column(std::string const& value,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto const scalar = cudf::string_scalar(value);
return cudf::make_column_from_scalar(scalar, num_rows, stream, mr);
}

std::unique_ptr<cudf::column> generate_random_string_column_from_set(
cudf::host_span<const char* const> set,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
// Build a gather map of random strings to choose from
// The size of the string sets always fits within 16-bit integers
auto const indices =
generate_primary_key_column(cudf::numeric_scalar<int16_t>(0), set.size(), stream, mr);
auto const keys = cudf::test::strings_column_wrapper(set.begin(), set.end()).release();
auto const gather_map = cudf::table_view({indices->view(), keys->view()});

// Build a column of random keys to gather from the set
auto const gather_keys =
generate_random_numeric_column<int16_t>(0, set.size() - 1, num_rows, stream, mr);

// Perform the gather operation
auto const gathered_table = cudf::gather(
gather_map, gather_keys->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
auto gathered_table_columns = gathered_table->release();
return std::move(gathered_table_columns[1]);
}

template <typename T>
std::unique_ptr<cudf::column> generate_repeat_sequence_column(T seq_length,
bool zero_indexed,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto pkey =
generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(0), num_rows, stream, mr);
auto repeat_seq_zero_indexed = cudf::binary_operation(pkey->view(),
cudf::numeric_scalar<T>(seq_length),
cudf::binary_operator::MOD,
cudf::data_type{cudf::type_to_id<T>()},
stream,
mr);
if (zero_indexed) { return repeat_seq_zero_indexed; }
return cudf::binary_operation(repeat_seq_zero_indexed->view(),
cudf::numeric_scalar<T>(1),
cudf::binary_operator::ADD,
cudf::data_type{cudf::type_to_id<T>()},
stream,
mr);
}

template std::unique_ptr<cudf::column> generate_repeat_sequence_column<int8_t>(
int8_t seq_length,
bool zero_indexed,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

template std::unique_ptr<cudf::column> generate_repeat_sequence_column<cudf::size_type>(
cudf::size_type seq_length,
bool zero_indexed,
cudf::size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace cudf::datagen
Loading

0 comments on commit 23fb31e

Please sign in to comment.