Fix issue with no decompressed data in ORC reader (#13609)

Currently, the ORC reader assumes that if the data in the stripes of the current level is not empty, the decompressed data will also not be empty. However, there is a corner case where the stripe is empty but its data blocks are still compressed, so they contain only the compression header. In this case, the decompressed data is empty even though the compressed blocks are non-empty. This PR replaces the strict non-empty assertion in the reader with a consistency check to allow for this corner case. It also adds a short-circuit to `decompress_stripe_data` that returns early if the decompressed data is empty. Issue #13608 Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: #13609
rapidsai · Jun 26, 2023 · 042e0a3 · 042e0a3
1 parent 909cc0b
commit 042e0a3
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 5 deletions.
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
@@ -317,12 +317,16 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
     total_decomp_size += compinfo[i].max_uncompressed_size;
   }
-  CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found");
+  CUDF_EXPECTS(
+    not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
+    "Inconsistent info on compression blocks");
 
   // Buffer needs to be padded.
   // Required by `gpuDecodeOrcColumnData`.
   rmm::device_buffer decomp_data(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+  if (decomp_data.size() == 0) { return decomp_data; }
+
   rmm::device_uvector<device_span<uint8_t const>> inflate_in(
     num_compressed_blocks + num_uncompressed_blocks, stream);
   rmm::device_uvector<device_span<uint8_t>> inflate_out(

diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -921,7 +921,6 @@ def test_orc_writer_decimal(tmpdir, scale, decimal_type):
 
 @pytest.mark.parametrize("num_rows", [1, 100, 3000])
 def test_orc_reader_multiple_files(datadir, num_rows):
-
     path = datadir / "TestOrcFile.testSnappy.orc"
 
     df_1 = pd.read_orc(path)
@@ -939,7 +938,6 @@ def test_orc_reader_multiple_files(datadir, num_rows):
 
 
 def test_orc_reader_multi_file_single_stripe(datadir):
-
     path = datadir / "TestOrcFile.testSnappy.orc"
 
     # should raise an exception
@@ -948,7 +946,6 @@ def test_orc_reader_multi_file_single_stripe(datadir):
 
 
 def test_orc_reader_multi_file_multi_stripe(datadir):
-
     path = datadir / "TestOrcFile.testStripeLevelStats.orc"
     gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]])
     pdf = pd.read_orc(path)
@@ -1100,7 +1097,6 @@ def list_struct_buff():
 @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000])
 @pytest.mark.parametrize("use_index", [True, False])
 def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff):
-
     gdf = cudf.read_orc(
         list_struct_buff,
         columns=columns,
@@ -1905,3 +1901,15 @@ def test_reader_row_index_order(data):
     expected.to_pandas().to_orc(buffer)
     got = cudf.read_orc(buffer)
     assert_eq(expected, got)
+
+
+# Test the corner case where empty blocks are compressed
+# Decompressed data size is zero, even though compressed data size is non-zero
+# For more information see https://github.com/rapidsai/cudf/issues/13608
+def test_orc_reader_empty_decomp_data(datadir):
+    path = datadir / "TestOrcFile.Spark.EmptyDecompData.orc"
+
+    expect = pd.read_orc(path)
+    got = cudf.read_orc(path)
+
+    assert_eq(expect, got)