Skip to content

Commit

Permalink
Fix issue with no decompressed data in ORC reader (#13609)
Browse files Browse the repository at this point in the history
Currently, the ORC reader assumes that if the data in the stripes of the current level is not empty, the decompressed data will also not be empty.
However, there is a corner case where the stripe is empty, but data blocks are still compressed, so they contain the compression header. In this case, the decompressed data is empty even though the compressed blocks are non-empty.

This PR removes the assertion in the reader to allow for this corner case. Also adds a short-circuit to `decompress_stripe_data` to return early if the decompressed data is empty.

Issue #13608

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #13609
  • Loading branch information
vuule authored Jun 26, 2023
1 parent 909cc0b commit 042e0a3
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
6 changes: 5 additions & 1 deletion cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,16 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
total_decomp_size += compinfo[i].max_uncompressed_size;
}
CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found");
CUDF_EXPECTS(
not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
"Inconsistent info on compression blocks");

// Buffer needs to be padded.
// Required by `gpuDecodeOrcColumnData`.
rmm::device_buffer decomp_data(
cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
if (decomp_data.size() == 0) { return decomp_data; }

rmm::device_uvector<device_span<uint8_t const>> inflate_in(
num_compressed_blocks + num_uncompressed_blocks, stream);
rmm::device_uvector<device_span<uint8_t>> inflate_out(
Expand Down
Binary file not shown.
16 changes: 12 additions & 4 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,6 @@ def test_orc_writer_decimal(tmpdir, scale, decimal_type):

@pytest.mark.parametrize("num_rows", [1, 100, 3000])
def test_orc_reader_multiple_files(datadir, num_rows):

path = datadir / "TestOrcFile.testSnappy.orc"

df_1 = pd.read_orc(path)
Expand All @@ -939,7 +938,6 @@ def test_orc_reader_multiple_files(datadir, num_rows):


def test_orc_reader_multi_file_single_stripe(datadir):

path = datadir / "TestOrcFile.testSnappy.orc"

# should raise an exception
Expand All @@ -948,7 +946,6 @@ def test_orc_reader_multi_file_single_stripe(datadir):


def test_orc_reader_multi_file_multi_stripe(datadir):

path = datadir / "TestOrcFile.testStripeLevelStats.orc"
gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]])
pdf = pd.read_orc(path)
Expand Down Expand Up @@ -1100,7 +1097,6 @@ def list_struct_buff():
@pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000])
@pytest.mark.parametrize("use_index", [True, False])
def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff):

gdf = cudf.read_orc(
list_struct_buff,
columns=columns,
Expand Down Expand Up @@ -1905,3 +1901,15 @@ def test_reader_row_index_order(data):
expected.to_pandas().to_orc(buffer)
got = cudf.read_orc(buffer)
assert_eq(expected, got)


# Test the corner case where empty blocks are compressed
# Decompressed data size is zero, even though compressed data size is non-zero
# For more information see https://github.com/rapidsai/cudf/issues/13608
def test_orc_reader_empty_decomp_data(datadir):
    """Read the EmptyDecompData ORC fixture with pandas and cudf and
    check that both readers produce equal results."""
    orc_file = datadir / "TestOrcFile.Spark.EmptyDecompData.orc"

    # pandas output serves as the reference for the cudf reader.
    pandas_result = pd.read_orc(orc_file)
    cudf_result = cudf.read_orc(orc_file)

    assert_eq(pandas_result, cudf_result)

0 comments on commit 042e0a3

Please sign in to comment.