Merge branch 'pandas_upgrade' of https://github.com/galipremsagar/cudf …

…into pandas_upgrade
rapidsai · Oct 4, 2024 · 14093d7 · 14093d7
2 parents 5e1c20e + 03749a9
commit 14093d7
Show file tree

Hide file tree

Showing 10 changed files with 225 additions and 188 deletions.
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
@@ -24,14 +24,17 @@ rapids-logger "Download wheels"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep
 
-rapids-logger "Install pylibcudf"
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+rapids-logger "Install libcudf, pylibcudf and cudf_polars"
+python -m pip install \
+    -v \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)
 
 TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"

diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh
@@ -17,9 +17,6 @@ rapids-dependency-file-generator \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
     | tee "${ENV_YAML_DIR}/env.yaml"
 
-rapids-logger "env file"
-cat "${ENV_YAML_DIR}/env.yaml"
-
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
 
 # Temporarily allow unbound variables for conda activation.

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,16 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
 #include <list>
 #include <memory>
@@ -35,13 +37,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-class AST : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-static void BM_ast_transform(benchmark::State& state)
+static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf::compute_column(table, expression_tree_root);
-  }
-
   // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
-}
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
 
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable)     \
-  (::benchmark::State & st)                                                                \
+  static void name(::nvbench::state& st)                                                   \
   {                                                                                        \
-    BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                    \
+    ::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                  \
   }                                                                                        \
-  BENCHMARK_REGISTER_F(AST, name)                                                          \
-    ->Apply(CustomRanges)                                                                  \
-    ->Unit(benchmark::kMillisecond)                                                        \
-    ->UseManualTime();
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name(#name)                                                                       \
+    .add_int64_axis("tree_levels", {1, 5, 10})                                             \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);

diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,15 +15,14 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
-#include <vector>
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
 
@@ -33,23 +32,21 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns>
-class BINARYOP : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns>
-static void BM_binaryop_transform(benchmark::State& state)
+static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table = create_sequence_table(
     cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
   cudf::table_view table{*source_table};
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
+  // Use the number of bytes read from global memory
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
     auto const op               = cudf::binary_operator::ADD;
     auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
@@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state)
         result = cudf::binary_operation(result->view(), col, op, result_data_type);
       });
     }
-  }
-
-  // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
+  });
 }
 
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
-  BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns)     \
-  (::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
+                                                                                      \
+  static void name(::nvbench::state& st)                                              \
+  {                                                                                   \
+    BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                    \
+  }                                                                                   \
+  NVBENCH_BENCH(name)                                                                 \
+    .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
                                     double,
                                     TreeType::IMBALANCED_LEFT,
                                     false);
-
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
-}
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -15,20 +15,18 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 
-class COMPILED_BINARYOP : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
-void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
+void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const column_size{static_cast<cudf::size_type>(state.range(0))};
+  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
   // Call once for hot cache.
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    cudf::binary_operation(lhs, rhs, binop, output_dtype);
-  }
-
   // use number of bytes read and written to global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
-                          (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut)));
+  state.add_global_memory_reads<TypeLhs>(table_size);
+  state.add_global_memory_reads<TypeRhs>(table_size);
+  state.add_global_memory_reads<TypeOut>(table_size);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
 }
 
+#define BM_STRINGIFY(a) #a
+
 // TODO tparam boolean for null.
-#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)           \
-  BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name)                             \
-  (::benchmark::State & st)                                               \
-  {                                                                       \
-    BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop); \
-  }                                                                       \
-  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name)                           \
-    ->Unit(benchmark::kMicrosecond)                                       \
-    ->UseManualTime()                                                     \
-    ->Arg(10000)      /* 10k */                                           \
-    ->Arg(100000)     /* 100k */                                          \
-    ->Arg(1000000)    /* 1M */                                            \
-    ->Arg(10000000)   /* 10M */                                           \
-    ->Arg(100000000); /* 100M */
+#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)               \
+  static void name(::nvbench::state& st)                                      \
+  {                                                                           \
+    ::BM_compiled_binaryop<lhs, rhs, tout>(st, ::cudf::binary_operator::bop); \
+  }                                                                           \
+  NVBENCH_BENCH(name)                                                         \
+    .set_name("compiled_binary_op_" BM_STRINGIFY(name))                       \
+    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d
 

diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
@@ -86,14 +86,28 @@ class datasource {
   /**
    * @brief Creates a source from a file path.
    *
+   * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the
+   * `datasource` implementation about the expected range of the data that will be read. The
+   * implementation may use these hints to optimize the read operation. These parameters are usually
+   * based on the byte range option. In this case, `min_size_estimate` should be no greater than the
+   * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can
+   * include padding after the byte range, to include additional data that may be needed for
+   * processing.
+   *
+   @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate
+   *
    * @param[in] filepath Path to the file to use
-   * @param[in] offset Bytes from the start of the file (the default is zero)
-   * @param[in] size Bytes from the offset; use zero for entire file (the default is zero)
+   * @param[in] offset Starting byte offset from which data will be read (the default is zero)
+   * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
+   * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
    * @return Constructed datasource object
    */
   static std::unique_ptr<datasource> create(std::string const& filepath,
-                                            size_t offset = 0,
-                                            size_t size   = 0);
+                                            size_t offset            = 0,
+                                            size_t max_size_estimate = 0,
+                                            size_t min_size_estimate = 0);
 
   /**
    * @brief Creates a source from a host memory buffer.

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
@@ -122,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 namespace {
 
 std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
-                                                                    size_t range_offset = 0,
-                                                                    size_t range_size   = 0)
+                                                                    size_t offset            = 0,
+                                                                    size_t max_size_estimate = 0,
+                                                                    size_t min_size_estimate = 0)
 {
   switch (info.type()) {
     case io_type::FILEPATH: {
       auto sources = std::vector<std::unique_ptr<cudf::io::datasource>>();
       for (auto const& filepath : info.filepaths()) {
-        sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size));
+        sources.emplace_back(
+          cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate));
       }
       return sources;
     }
@@ -211,7 +213,8 @@ table_with_metadata read_json(json_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   return json::detail::read_json(datasources, options, stream, mr);
 }
@@ -238,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported.");