Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON tree algorithms refactor I: CSR data structure for column tree #15979

Merged
merged 75 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
1ec9617
added csr data struct
shrshi Jun 11, 2024
022d7ce
formatting
shrshi Jun 11, 2024
382633f
added test
shrshi Jun 25, 2024
1823854
formatting
shrshi Jun 25, 2024
4a7e2a5
Merge branch 'branch-24.08' into json-tree-refactor-ii
shrshi Jun 25, 2024
8d5ddfb
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 26, 2024
84a7749
fixing csr construction
shrshi Jun 28, 2024
810c389
moving the csr algorithms
shrshi Jun 28, 2024
6a1a415
formatting
shrshi Jun 28, 2024
85c197d
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 28, 2024
996c6dd
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jun 28, 2024
4bba629
moving to experimental namespace
shrshi Jul 15, 2024
25530f6
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jul 15, 2024
df9e65b
formatting
shrshi Jul 15, 2024
d1588c8
removed node properties from csr struct - will be introduced in stage…
shrshi Jul 15, 2024
7e1a756
merging branch 24.08 into current branch
shrshi Jul 24, 2024
5541b93
partial commit
shrshi Jul 24, 2024
1490ce9
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 24, 2024
d05e670
better csr construction
shrshi Jul 30, 2024
1ce88be
formatting
shrshi Jul 30, 2024
d6d724c
exec policy is no sync
shrshi Jul 30, 2024
2622d6b
fix copyright year
shrshi Jul 30, 2024
9498372
fixing max row offsets
shrshi Jul 31, 2024
4339b0a
formatting
shrshi Jul 31, 2024
e61288b
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 31, 2024
9b6b7ff
struct docs
shrshi Jul 31, 2024
53db174
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jul 31, 2024
85608eb
cudf exports!
shrshi Jul 31, 2024
f451c40
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 6, 2024
e29656d
deduplicating code
shrshi Sep 6, 2024
e6eda41
formatting
shrshi Sep 6, 2024
bf4f191
addressing reviews - 1
shrshi Sep 6, 2024
55e943a
addressing reviews - 2
shrshi Sep 6, 2024
4e00526
tsk tsk should have run compute sanitizer sooner
shrshi Sep 6, 2024
ca7a5f3
addressing reviews - 3
shrshi Sep 6, 2024
14664db
addressing reviews - 4
shrshi Sep 6, 2024
63eec8a
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 11, 2024
5f4aca6
adding more tests; debugging on the way
shrshi Sep 13, 2024
e6a9941
formatting
shrshi Sep 13, 2024
82c9ebe
added more tests; fixed bugs
shrshi Sep 17, 2024
8dd6877
formatting
shrshi Sep 17, 2024
0c63f22
finally tests passing
shrshi Sep 18, 2024
2d4861e
fixed all bugs hopefully
shrshi Sep 19, 2024
7759a91
formatting
shrshi Sep 19, 2024
e5d4a35
pr reviews
shrshi Sep 20, 2024
3cdc211
exec policy sync -> nosync
shrshi Sep 20, 2024
9ca7b5e
pr reviews
shrshi Sep 20, 2024
29be430
cleanup
shrshi Sep 20, 2024
023a4a8
moving steps to lambdas to handle intermediate vectors
shrshi Sep 20, 2024
ded2c5e
formatting
shrshi Sep 20, 2024
b2d11dd
more lambdas
shrshi Sep 20, 2024
7260ae6
formatting
shrshi Sep 20, 2024
827756e
moving the og reduce to column tree back to json_column.cu
shrshi Sep 21, 2024
219640f
formatting
shrshi Sep 21, 2024
c20cc15
merge
shrshi Sep 22, 2024
28b9e9f
fixing bad merge
shrshi Sep 22, 2024
529e88e
simplifying; using previous reduce to column tree results
shrshi Sep 23, 2024
545359d
formatting
shrshi Sep 23, 2024
24be71f
simplifying
shrshi Sep 23, 2024
6e1da07
formatting
shrshi Sep 23, 2024
bab5d75
remove debugging code
shrshi Sep 23, 2024
c826144
remove debug printing
shrshi Sep 23, 2024
937263f
reviews
shrshi Sep 23, 2024
7952cd7
reviews
shrshi Sep 23, 2024
adc74b6
some more cleanup
shrshi Sep 23, 2024
37e7511
reviews
shrshi Sep 23, 2024
2c37e42
pr reviews
shrshi Sep 23, 2024
278f391
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 24, 2024
3442ebc
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 24, 2024
896a9b2
Merge branch 'branch-24.10' into json-tree-refactor
vuule Sep 24, 2024
a6b61f8
Merge branch 'branch-24.10' into json-tree-refactor
galipremsagar Sep 24, 2024
9f041a6
pr reviews
shrshi Sep 25, 2024
3c7f8ba
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Sep 25, 2024
9680695
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 25, 2024
0ad638e
Merge branch 'branch-24.10' into json-tree-refactor
galipremsagar Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ add_library(
src/io/functions.cpp
src/io/json/host_tree_algorithms.cu
src/io/json/json_column.cu
src/io/json/column_tree_construction.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
Expand Down
304 changes: 304 additions & 0 deletions cpp/src/io/json/column_tree_construction.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nested_json.hpp"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/functional>
#include <thrust/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>

namespace cudf::io::json {

using row_offset_t = size_type;

#ifdef CSR_DEBUG_PRINT
template <typename T>
void print(device_span<T const> d_vec, std::string name, rmm::cuda_stream_view stream)
vuule marked this conversation as resolved.
Show resolved Hide resolved
{
stream.synchronize();
auto h_vec = cudf::detail::make_std_vector_sync(d_vec, stream);
std::cout << name << " = ";
for (auto e : h_vec) {
std::cout << e << " ";
}
std::cout << std::endl;
}
#endif

namespace experimental::detail {

struct level_ordering {
device_span<TreeDepthT const> node_levels;
device_span<NodeIndexT const> col_ids;
device_span<NodeIndexT const> parent_node_ids;
__device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const
{
auto lhs_parent_col_id = parent_node_ids[lhs_node_id] == parent_node_sentinel
? parent_node_sentinel
: col_ids[parent_node_ids[lhs_node_id]];
auto rhs_parent_col_id = parent_node_ids[rhs_node_id] == parent_node_sentinel
? parent_node_sentinel
: col_ids[parent_node_ids[rhs_node_id]];

return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) ||
(node_levels[lhs_node_id] == node_levels[rhs_node_id] &&
lhs_parent_col_id < rhs_parent_col_id) ||
(node_levels[lhs_node_id] == node_levels[rhs_node_id] &&
lhs_parent_col_id == rhs_parent_col_id && col_ids[lhs_node_id] < col_ids[rhs_node_id]);
}
};

struct parent_nodeids_to_colids {
device_span<NodeIndexT const> rev_mapped_col_ids;
__device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT
{
return parent_node_id == parent_node_sentinel ? parent_node_sentinel
: rev_mapped_col_ids[parent_node_id];
}
};

/**
* @brief Reduces node tree representation to column tree CSR representation.
*
* @param node_tree Node tree representation of JSON string
* @param original_col_ids Column ids of nodes
* @param row_offsets Row offsets of nodes
* @param is_array_of_arrays Whether the tree is an array of arrays
* @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A tuple of column tree representation of JSON string, column ids of columns, and
* max row offsets of columns
*/
std::tuple<compressed_sparse_row, column_tree_properties> reduce_to_column_tree(
tree_meta_t& node_tree,
device_span<NodeIndexT const> original_col_ids,
device_span<NodeIndexT const> sorted_col_ids,
device_span<NodeIndexT const> ordered_node_ids,
device_span<row_offset_t const> row_offsets,
bool is_array_of_arrays,
NodeIndexT row_array_parent_col_id,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

if (original_col_ids.empty()) {
rmm::device_uvector<NodeIndexT> empty_row_idx(0, stream);
rmm::device_uvector<NodeIndexT> empty_col_idx(0, stream);
rmm::device_uvector<NodeT> empty_column_categories(0, stream);
rmm::device_uvector<row_offset_t> empty_max_row_offsets(0, stream);
rmm::device_uvector<NodeIndexT> empty_mapped_col_ids(0, stream);
return std::tuple{compressed_sparse_row{std::move(empty_row_idx), std::move(empty_col_idx)},
column_tree_properties{std::move(empty_column_categories),
std::move(empty_max_row_offsets),
std::move(empty_mapped_col_ids)}};
}

auto [unpermuted_tree, unpermuted_col_ids, unpermuted_max_row_offsets] =
cudf::io::json::detail::reduce_to_column_tree(node_tree,
original_col_ids,
sorted_col_ids,
ordered_node_ids,
row_offsets,
is_array_of_arrays,
row_array_parent_col_id,
stream);

NodeIndexT num_columns = unpermuted_col_ids.size();

auto mapped_col_ids = cudf::detail::make_device_uvector_async(
unpermuted_col_ids, stream, cudf::get_current_device_resource_ref());
rmm::device_uvector<NodeIndexT> rev_mapped_col_ids(num_columns, stream);
rmm::device_uvector<NodeIndexT> reordering_index(unpermuted_col_ids.size(), stream);

thrust::sequence(
rmm::exec_policy_nosync(stream), reordering_index.begin(), reordering_index.end());
// Reorder nodes and column ids in level-wise fashion
thrust::sort_by_key(
rmm::exec_policy_nosync(stream),
reordering_index.begin(),
reordering_index.end(),
mapped_col_ids.begin(),
level_ordering{
unpermuted_tree.node_levels, unpermuted_col_ids, unpermuted_tree.parent_node_ids});

{
auto mapped_col_ids_copy = cudf::detail::make_device_uvector_async(
mapped_col_ids, stream, cudf::get_current_device_resource_ref());
thrust::sequence(
rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end());
thrust::sort_by_key(rmm::exec_policy_nosync(stream),
mapped_col_ids_copy.begin(),
mapped_col_ids_copy.end(),
rev_mapped_col_ids.begin());
}

rmm::device_uvector<NodeIndexT> parent_col_ids(num_columns, stream);
thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(),
parent_nodeids_to_colids{rev_mapped_col_ids});
rmm::device_uvector<row_offset_t> max_row_offsets(num_columns, stream);
rmm::device_uvector<NodeT> column_categories(num_columns, stream);
thrust::copy_n(
rmm::exec_policy_nosync(stream),
thrust::make_zip_iterator(thrust::make_permutation_iterator(
unpermuted_tree.parent_node_ids.begin(), reordering_index.begin()),
thrust::make_permutation_iterator(unpermuted_max_row_offsets.begin(),
reordering_index.begin()),
thrust::make_permutation_iterator(
unpermuted_tree.node_categories.begin(), reordering_index.begin())),
num_columns,
thrust::make_zip_iterator(
parent_col_ids_it, max_row_offsets.begin(), column_categories.begin()));

#ifdef CSR_DEBUG_PRINT
print<NodeIndexT>(reordering_index, "h_reordering_index", stream);
print<NodeIndexT>(mapped_col_ids, "h_mapped_col_ids", stream);
print<NodeIndexT>(rev_mapped_col_ids, "h_rev_mapped_col_ids", stream);
print<NodeIndexT>(parent_col_ids, "h_parent_col_ids", stream);
print<row_offset_t>(max_row_offsets, "h_max_row_offsets", stream);
#endif

auto construct_row_idx = [&stream](NodeIndexT num_columns,
device_span<NodeIndexT const> parent_col_ids) {
auto row_idx = cudf::detail::make_zeroed_device_uvector_async<NodeIndexT>(
static_cast<std::size_t>(num_columns + 1), stream, cudf::get_current_device_resource_ref());
// Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel)
// children adjacency

auto num_non_leaf_columns = thrust::unique_count(
rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end());
rmm::device_uvector<NodeIndexT> non_leaf_nodes(num_non_leaf_columns, stream);
rmm::device_uvector<NodeIndexT> non_leaf_nodes_children(num_non_leaf_columns, stream);
thrust::reduce_by_key(rmm::exec_policy_nosync(stream),
parent_col_ids.begin() + 1,
parent_col_ids.end(),
thrust::make_constant_iterator(1),
non_leaf_nodes.begin(),
non_leaf_nodes_children.begin(),
thrust::equal_to<TreeDepthT>());

thrust::scatter(rmm::exec_policy_nosync(stream),
non_leaf_nodes_children.begin(),
non_leaf_nodes_children.end(),
non_leaf_nodes.begin(),
row_idx.begin() + 1);

if (num_columns > 1) {
thrust::transform_inclusive_scan(
rmm::exec_policy_nosync(stream),
thrust::make_zip_iterator(thrust::make_counting_iterator(1), row_idx.begin() + 1),
thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, row_idx.end()),
row_idx.begin() + 1,
cuda::proclaim_return_type<NodeIndexT>([] __device__(auto a) {
auto n = thrust::get<0>(a);
auto idx = thrust::get<1>(a);
return n == 1 ? idx : idx + 1;
}),
thrust::plus<NodeIndexT>{});
} else {
auto single_node = 1;
row_idx.set_element_async(1, single_node, stream);
}

#ifdef CSR_DEBUG_PRINT
print<NodeIndexT>(row_idx, "h_row_idx", stream);
#endif
return row_idx;
};

auto construct_col_idx = [&stream](NodeIndexT num_columns,
device_span<NodeIndexT const> parent_col_ids,
device_span<NodeIndexT const> row_idx) {
rmm::device_uvector<NodeIndexT> col_idx((num_columns - 1) * 2, stream);
thrust::fill(rmm::exec_policy_nosync(stream), col_idx.begin(), col_idx.end(), -1);
// excluding root node, construct scatter map
rmm::device_uvector<NodeIndexT> map(num_columns - 1, stream);
thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream),
parent_col_ids.begin() + 1,
parent_col_ids.end(),
thrust::make_constant_iterator(1),
map.begin());
thrust::for_each_n(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(1),
num_columns - 1,
[row_idx = row_idx.begin(),
map = map.begin(),
parent_col_ids = parent_col_ids.begin()] __device__(auto i) {
auto parent_col_id = parent_col_ids[i];
if (parent_col_id == 0)
--map[i - 1];
else
map[i - 1] += row_idx[parent_col_id];
});
thrust::scatter(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(1),
thrust::make_counting_iterator(1) + num_columns - 1,
map.begin(),
col_idx.begin());

// Skip the parent of root node
thrust::scatter(rmm::exec_policy_nosync(stream),
parent_col_ids.begin() + 1,
parent_col_ids.end(),
row_idx.begin() + 1,
col_idx.begin());

#ifdef CSR_DEBUG_PRINT
print<NodeIndexT>(col_idx, "h_col_idx", stream);
#endif

return col_idx;
};

/*
5. CSR construction:
a. Sort column levels and get their ordering
b. For each column node coln iterated according to sorted_column_levels; do
i. Find nodes that have coln as the parent node -> set adj_coln
ii. row idx[coln] = size of adj_coln + 1
iii. col idx[coln] = adj_coln U {parent_col_id[coln]}
*/
auto row_idx = construct_row_idx(num_columns, parent_col_ids);
auto col_idx = construct_col_idx(num_columns, parent_col_ids, row_idx);

return std::tuple{
compressed_sparse_row{std::move(row_idx), std::move(col_idx)},
column_tree_properties{
std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}};
}

} // namespace experimental::detail
} // namespace cudf::io::json
Loading
Loading