From 32130c2332a3bbd5b418f7963f02cf0280c2940a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 30 Jul 2024 10:46:27 +0200 Subject: [PATCH 01/88] clean up cmake RemoteHandle::read() remove StreamFuture(StreamFuture&&) python bindings and tests read to device memory dependencies: aws-sdk-cpp parse_s3_path cmake: adding AWSSDK to python built RemoteFile.from_url() benchmark benchmark: --api numpy ibenchmark: --api cudf cleanup benchmark: adding --api cudf-fsspec read(): use PushAndPopContext impl. pread ensure_aws_s3_api_is_initalized AwsS3Client S3Context clean up SameThreadExecutor use shared pointer cmake: AWSSDK COMPONENTS s3 transfer create_transfer_manager clean up BufferAsStream test_read: larger data size KVIKIO_NVTX_FUNC_RANGE benchmark: use random RemoteHandle::read_to_host(): print bandwidth benchmark clean up remove the use of the transfer module ci: some more aws-sdk-cpp benchmark: pytest.importorskip("moto") remote_handle.pyx remote_file.py make remote IO optional don't use typing_extensions dependencies: boto3 and moto more aws-sdk-cpp trigger CI error if remote module wasn't built --- .../all_cuda-118_arch-aarch64.yaml | 3 + .../all_cuda-118_arch-x86_64.yaml | 3 + .../all_cuda-125_arch-aarch64.yaml | 3 + .../all_cuda-125_arch-x86_64.yaml | 3 + conda/recipes/kvikio/meta.yaml | 3 + conda/recipes/libkvikio/meta.yaml | 8 + cpp/CMakeLists.txt | 7 + cpp/examples/CMakeLists.txt | 4 + cpp/include/kvikio/file_handle.hpp | 10 +- cpp/include/kvikio/remote_handle.hpp | 264 ++++++++++++++++++ dependencies.yaml | 9 +- python/kvikio/kvikio/__init__.py | 9 +- python/kvikio/kvikio/_lib/CMakeLists.txt | 9 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 78 ++++++ python/kvikio/kvikio/benchmarks/aws_s3_io.py | 230 +++++++++++++++ python/kvikio/kvikio/remote_file.py | 57 ++++ python/kvikio/pyproject.toml | 2 + python/kvikio/tests/test_aws_s3.py | 137 +++++++++ python/kvikio/tests/test_benchmarks.py | 46 +++ 19 files changed, 872 insertions(+), 13 
deletions(-) create mode 100644 cpp/include/kvikio/remote_handle.hpp create mode 100644 python/kvikio/kvikio/_lib/remote_handle.pyx create mode 100644 python/kvikio/kvikio/benchmarks/aws_s3_io.py create mode 100644 python/kvikio/kvikio/remote_file.py create mode 100644 python/kvikio/tests/test_aws_s3.py diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index c6e5314b17..82790a443d 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -6,6 +6,8 @@ channels: - conda-forge - nvidia dependencies: +- aws-sdk-cpp +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-python>=11.7.1,<12.0a0 @@ -17,6 +19,7 @@ dependencies: - dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-aarch64=11.* +- moto>=4.0.8 - ninja - numcodecs <0.12.0 - numpy>=1.23,<2.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 526e7155ef..cce77c120f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -6,6 +6,8 @@ channels: - conda-forge - nvidia dependencies: +- aws-sdk-cpp +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-python>=11.7.1,<12.0a0 @@ -19,6 +21,7 @@ dependencies: - gcc_linux-64=11.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 +- moto>=4.0.8 - ninja - numcodecs <0.12.0 - numpy>=1.23,<2.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 8a9b368aaa..6500ccfea3 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -6,6 +6,8 @@ channels: - conda-forge - nvidia dependencies: +- aws-sdk-cpp +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-nvcc @@ -18,6 +20,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-aarch64=11.* - libcufile-dev +- moto>=4.0.8 - ninja - 
numcodecs <0.12.0 - numpy>=1.23,<2.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 3a1ed63b16..f5942cc6f6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -6,6 +6,8 @@ channels: - conda-forge - nvidia dependencies: +- aws-sdk-cpp +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-nvcc @@ -18,6 +20,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-64=11.* - libcufile-dev +- moto>=4.0.8 - ninja - numcodecs <0.12.0 - numpy>=1.23,<2.0a0 diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml index 247be31cf7..504a38b0cf 100644 --- a/conda/recipes/kvikio/meta.yaml +++ b/conda/recipes/kvikio/meta.yaml @@ -52,6 +52,7 @@ requirements: - {{ compiler('cuda') }} {% endif %} - {{ stdlib("c") }} + - aws-sdk-cpp host: - python - pip @@ -64,11 +65,13 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - libkvikio ={{ version }} + - aws-sdk-cpp run: - python - numpy >=1.23,<2.0a0 - cupy >=12.0.0 - zarr + - aws-sdk-cpp # See https://github.com/zarr-developers/numcodecs/pull/475 - numcodecs <0.12.0 - packaging diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 186c373f56..0c97e01abe 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -43,6 +43,7 @@ requirements: {% endif %} - ninja - {{ stdlib("c") }} + - aws-sdk-cpp host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} @@ -52,6 +53,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} + - aws-sdk-cpp outputs: - name: libkvikio @@ -72,8 +74,10 @@ outputs: requirements: build: - cmake {{ cmake_version }} + - aws-sdk-cpp host: - cuda-version ={{ cuda_version }} + - aws-sdk-cpp run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -83,6 +87,7 @@ outputs: {% else %} - 
libcufile-dev # [linux] {% endif %} + - aws-sdk-cpp test: commands: - test -f $PREFIX/include/kvikio/file_handle.hpp @@ -106,6 +111,7 @@ outputs: - cuda-cudart-dev - libcufile-dev # [linux] {% endif %} + - aws-sdk-cpp requirements: build: - cmake {{ cmake_version }} @@ -118,6 +124,7 @@ outputs: - cuda-cudart-dev - libcufile-dev # [linux] {% endif %} + - aws-sdk-cpp run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -127,6 +134,7 @@ outputs: - cuda-cudart - libcufile # [linux] {% endif %} + - aws-sdk-cpp about: home: https://rapids.ai license: Apache-2.0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f4f3f13109..646e676297 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -55,6 +55,12 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) +rapids_find_package( + AWSSDK COMPONENTS s3 + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports +) + rapids_find_package( cuFile BUILD_EXPORT_SET kvikio-exports @@ -130,6 +136,7 @@ target_include_directories( ) target_link_libraries( kvikio INTERFACE Threads::Threads ${CMAKE_DL_LIBS} nvtx3::nvtx3-cpp BS::thread_pool + ${AWSSDK_LINK_LIBRARIES} ) target_compile_features(kvikio INTERFACE cxx_std_17) diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt index c12ddb2e52..284590e943 100644 --- a/cpp/examples/CMakeLists.txt +++ b/cpp/examples/CMakeLists.txt @@ -14,6 +14,8 @@ set(TEST_INSTALL_PATH bin/tests/libkvikio) +# Example: basic_io + if(CUDAToolkit_FOUND) add_executable(BASIC_IO_TEST basic_io.cpp) set_target_properties(BASIC_IO_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") @@ -35,6 +37,8 @@ else() message(STATUS "Cannot build the basic_io example when CUDA is not found") endif() +# Example: basic_no_cuda + add_executable(BASIC_NO_CUDA_TEST basic_no_cuda.cpp) set_target_properties(BASIC_NO_CUDA_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") target_include_directories(BASIC_NO_CUDA_TEST PRIVATE ../include) diff 
--git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 80779c5282..cef5f4ed1f 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -144,7 +144,7 @@ class FileHandle { bool _initialized{false}; bool _compat_mode{false}; mutable std::size_t _nbytes{0}; // The size of the underlying file, zero means unknown. - CUfileHandle_t _handle{}; + CUfileHandle_t _handle{nullptr}; public: static constexpr mode_t m644 = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH; @@ -208,7 +208,7 @@ class FileHandle { _initialized{std::exchange(o._initialized, false)}, _compat_mode{std::exchange(o._compat_mode, false)}, _nbytes{std::exchange(o._nbytes, 0)}, - _handle{std::exchange(o._handle, CUfileHandle_t{})} + _handle{std::exchange(o._handle, CUfileHandle_t{nullptr})} { } FileHandle& operator=(FileHandle&& o) noexcept @@ -218,7 +218,7 @@ class FileHandle { _initialized = std::exchange(o._initialized, false); _compat_mode = std::exchange(o._compat_mode, false); _nbytes = std::exchange(o._nbytes, 0); - _handle = std::exchange(o._handle, CUfileHandle_t{}); + _handle = std::exchange(o._handle, CUfileHandle_t{nullptr}); return *this; } ~FileHandle() noexcept { close(); } @@ -232,8 +232,8 @@ class FileHandle { { if (closed()) { return; } - if (!_compat_mode) { cuFileAPI::instance().HandleDeregister(_handle); } - ::close(_fd_direct_off); + if (_handle != nullptr) { cuFileAPI::instance().HandleDeregister(_handle); } + if (_fd_direct_off != -1) { ::close(_fd_direct_off); } if (_fd_direct_on != -1) { ::close(_fd_direct_on); } _fd_direct_on = -1; _fd_direct_off = -1; diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp new file mode 100644 index 0000000000..78f21d3024 --- /dev/null +++ b/cpp/include/kvikio/remote_handle.hpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +using namespace std::chrono; + +namespace kvikio { +namespace detail { + +/** + * Stream implementation of a fixed size buffer + */ +class BufferAsStream : public Aws::IOStream { + public: + using Base = Aws::IOStream; + explicit BufferAsStream(std::streambuf* buf) : Base(buf) {} + + ~BufferAsStream() override = default; +}; + +class S3Context { + public: + S3Context() : _client{S3Context::create_client()} {} + + Aws::S3::S3Client& client() { return *_client; } + + static S3Context& default_context() + { + static S3Context _default_context; + return _default_context; + } + + S3Context(S3Context const&) = delete; + void operator=(S3Context const&) = delete; + + private: + static void ensure_aws_s3_api_init() + { + static bool not_initalized{true}; + if (not_initalized) { + std::cout << "ensure_aws_s3_api_initalized INIT" << std::endl; + not_initalized = false; + + Aws::SDKOptions options; + // options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Error; + Aws::InitAPI(options); // Should only be called once. + } + } + + static std::shared_ptr create_client() + { + S3Context::ensure_aws_s3_api_init(); + + Aws::Client::ClientConfiguration clientConfig; + // Optional: Set to the AWS Region (overrides config file). 
+ // clientConfig.region = "us-east-1"; + + const char* endpointOverride = getenv("AWS_ENDPOINT_URL"); + if (endpointOverride != nullptr) { clientConfig.endpointOverride = endpointOverride; } + + // You don't normally have to test that you are authenticated. But the S3 service permits + // anonymous requests, thus the s3Client will return "success" even if you are + // unauthenticated, which can be confusing to a new user. + auto provider = Aws::MakeShared("alloc-tag"); + auto creds = provider->GetAWSCredentials(); + if (creds.IsEmpty()) { + throw std::runtime_error(std::string("Failed authentication to ") + endpointOverride); + } + auto ret = std::make_shared(Aws::S3::S3Client(clientConfig)); + + // Try the connection + auto outcome = ret->ListBuckets(); + if (!outcome.IsSuccess()) { + throw std::runtime_error(std::string("S3 error: ") + outcome.GetError().GetMessage()); + } + return ret; + } + + std::shared_ptr _client; +}; + +inline std::size_t get_s3_file_size(const std::string& bucket_name, const std::string& object_name) +{ + KVIKIO_NVTX_FUNC_RANGE(); + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket_name.c_str()); + req.SetKey(object_name.c_str()); + Aws::S3::Model::HeadObjectOutcome outcome = S3Context::default_context().client().HeadObject(req); + if (!outcome.IsSuccess()) { + const Aws::S3::S3Error& err = outcome.GetError(); + throw std::invalid_argument("get_s3_file_size(): " + err.GetExceptionName() + ": " + + err.GetMessage()); + } + return outcome.GetResult().GetContentLength(); +} + +inline std::pair parse_s3_path(const std::string& path) +{ + if (path.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } + if (path.size() < 5 || path.substr(0, 5) != "s3://") { + throw std::invalid_argument("The remote path must start with the S3 scheme (\"s3://\")."); + } + std::string p = path.substr(5); + if (p.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } + size_t pos = 
p.find_first_of('/'); + if (pos == 0) { throw std::invalid_argument("The remote path does not contain a bucket name."); } + return std::make_pair(p.substr(0, pos), (pos == std::string::npos) ? "" : p.substr(pos + 1)); +} + +} // namespace detail + +/** + * @brief Handle of + * + * At the moment, only AWS S3 is the supported + */ +class RemoteHandle { + private: + std::string _bucket_name{}; + std::string _object_name{}; + std::size_t _nbytes{}; + + public: + RemoteHandle() noexcept = default; + + RemoteHandle(std::string bucket_name, std::string object_name) + : _bucket_name(std::move(bucket_name)), + _object_name(std::move(object_name)), + _nbytes(detail::get_s3_file_size(_bucket_name, _object_name)) + { + std::cout << "RemoteHandle() - bucket_name: " << _bucket_name + << ", object_name: " << _object_name << ", nbytes: " << _nbytes << std::endl; + } + + RemoteHandle(const std::string& remote_path) + { + auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); + _bucket_name = std::move(bucket_name); + _object_name = std::move(object_name); + _nbytes = detail::get_s3_file_size(_bucket_name, _object_name); + + std::cout << "RemoteHandle() - remote_path: " << remote_path + << ", bucket_name: " << _bucket_name << ", object_name: " << _object_name + << ", nbytes: " << _nbytes << std::endl; + } + + /** + * @brief Get the file size + * + * @return The number of bytes + */ + [[nodiscard]] inline std::size_t nbytes() const { return _nbytes; } + + std::size_t read_to_host(void* buf, std::size_t size, std::size_t file_offset = 0) + { + KVIKIO_NVTX_FUNC_RANGE("AWS S3 receive", size); + auto t0 = high_resolution_clock::now(); + + auto& default_context = detail::S3Context::default_context(); + Aws::S3::Model::GetObjectRequest req; + req.SetBucket(_bucket_name.c_str()); + req.SetKey(_object_name.c_str()); + const std::string byte_range = + "bytes=" + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); + req.SetRange(byte_range.c_str()); + + 
// To write directly to `buf`, we register a "factory" that wraps a buffer as a output stream. + Aws::Utils::Stream::PreallocatedStreamBuf buf_stream(static_cast(buf), size); + req.SetResponseStreamFactory( + [&]() { return Aws::New("BufferAsStream", &buf_stream); }); + + Aws::S3::Model::GetObjectOutcome outcome = default_context.client().GetObject(req); + if (!outcome.IsSuccess()) { + const Aws::S3::S3Error& err = outcome.GetError(); + throw std::runtime_error(err.GetExceptionName() + ": " + err.GetMessage()); + } + const std::size_t n = outcome.GetResult().GetContentLength(); + if (n != size) { + throw std::runtime_error("S3 read of " + std::to_string(size) + " bytes failed, received " + + std::to_string(n) + " bytes"); + } + auto t1 = high_resolution_clock::now(); + float duration = size / (duration_cast(t1 - t0).count() / 1000000.0); + + std::cout << "RemoteHandle::read_to_host() - buf: " << buf << ", size: " << size + << ", file_offset: " << file_offset << ", bw: " << duration / (2 << 20) << " MiB/s" + << std::endl; + return n; + } + + std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0) + { + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::read()", size); + if (is_host_memory(buf)) { return read_to_host(buf, size, file_offset); } + + CUcontext ctx = get_context_from_pointer(buf); + PushAndPopContext c(ctx); + + auto alloc = detail::AllocRetain::instance().get(); // Host memory allocation + CUdeviceptr devPtr = convert_void2deviceptr(buf); + CUstream stream = detail::StreamsByThread::get(); + + std::size_t cur_file_offset = convert_size2off(file_offset); + std::size_t byte_remaining = convert_size2off(size); + + while (byte_remaining > 0) { + const std::size_t nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); + std::size_t nbytes_got = nbytes_requested; + nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); + 
CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + cur_file_offset += nbytes_got; + devPtr += nbytes_got; + byte_remaining -= nbytes_got; + } + return size; + } + + std::future pread(void* buf, std::size_t size, std::size_t file_offset = 0) + { + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::pread()", size); + std::cout << "RemoteHandle::pread()" << std::endl; + auto task = [this](void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset) -> std::size_t { + return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); + }; + return parallel_io(task, buf, size, file_offset, posix_bounce_buffer_size, 0); + } +}; + +} // namespace kvikio diff --git a/dependencies.yaml b/dependencies.yaml index b3617388ce..2dfd954af1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,6 +106,7 @@ dependencies: packages: - c-compiler - cxx-compiler + - aws-sdk-cpp specific: - output_types: conda matrices: @@ -319,6 +320,8 @@ dependencies: - &dask dask>=2022.05.2 - pytest - pytest-cov + - boto3>=1.21.21 + - moto>=4.0.8 specific: - output_types: [conda, requirements, pyproject] matrices: @@ -329,9 +332,3 @@ dependencies: - matrix: # All CUDA 11 versions packages: - cuda-python>=11.7.1,<12.0a0 - test_python_legate: - common: - - output_types: [conda, requirements, pyproject] - packages: - - *dask - - distributed>=2022.05.2 diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index d31d308916..e3bb0ffd37 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -4,6 +4,7 @@ from kvikio._lib import buffer, driver_properties # type: ignore from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile +from kvikio.remote_file import RemoteFile, is_remote_file_available def memory_register(buf) -> None: @@ -18,4 +19,10 @@ def memory_deregister(buf) -> None: DriverProperties = driver_properties.DriverProperties -__all__ = ["__git_commit__", 
"__version__", "CuFile"] +__all__ = [ + "__git_commit__", + "__version__", + "CuFile", + "RemoteFile", + "is_remote_file_available", +] diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index c77d8e3df1..2eec2d2668 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -17,8 +17,15 @@ set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_ha libnvcomp.pyx libnvcomp_ll.pyx ) +if(AWSSDK_FOUND) + message(STATUS "Building remote_handle.pyx (aws-cpp-sdk-s3 found)") + list(APPEND cython_modules remote_handle.pyx) +else() + message(WARNING "Skipping remote_handle.pyx (aws-cpp-sdk-s3 not found)") +endif() + rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp ${AWSSDK_LINK_LIBRARIES} ) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx new file mode 100644 index 0000000000..1fa436b01c --- /dev/null +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +# distutils: language = c++ +# cython: language_level=3 + +from typing import Optional + +from libc.stdint cimport uintptr_t +from libcpp.string cimport string +from libcpp.utility cimport pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + + +cdef extern from "" namespace "kvikio" nogil: + cdef cppclass RemoteHandle: + RemoteHandle() except + + RemoteHandle( + string bucket_name, + string object_name, + ) except + + RemoteHandle( + string remote_path, + ) except + + int nbytes() + size_t read( + void* buf, + size_t size, + size_t file_offset + ) except + + future[size_t] pread( + void* buf, + size_t size, + size_t file_offset + ) except + + + +cdef class RemoteFile: + """ Remote file handle""" + cdef RemoteHandle _handle + + @classmethod + def from_bucket_and_object(cls, bucket_name: str, object_name: str): + cdef RemoteFile ret = RemoteFile() + ret._handle = RemoteHandle( + str.encode(str(bucket_name)), + str.encode(str(object_name)), + ) + return ret + + @classmethod + def from_url(cls, url: str): + cdef RemoteFile ret = RemoteFile() + ret._handle = RemoteHandle(str.encode(str(url))) + return ret + + def nbytes(self) -> int: + return self._handle.nbytes() + + def read(self, buf, size: Optional[int], file_offset: int) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return self._handle.read( + info.first, + info.second, + file_offset, + ) + + def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return _wrap_io_future( + self._handle.pread( + info.first, + info.second, + file_offset, + ) + ) diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py new file mode 100644 index 0000000000..3c9d32e6c7 --- /dev/null +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -0,0 +1,230 @@ +# Copyright (c) 2024, 
NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import argparse +import contextlib +import multiprocessing +import os +import socket +import statistics +import sys +import time +from functools import partial +from typing import ContextManager +from urllib.parse import urlparse + +import boto3 +import cupy +import numpy +from dask.utils import format_bytes + +import kvikio +import kvikio.defaults + + +def get_local_port() -> int: + """Return an available port""" + sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +def start_s3_server(lifetime=3600): + from moto.server import ThreadedMotoServer + + # Silence the activity info from ThreadedMotoServer + sys.stderr = open("/dev/null", "w") + url = urlparse(os.environ["AWS_ENDPOINT_URL"]) + server = ThreadedMotoServer(ip_address=url.hostname, port=url.port) + server.start() + time.sleep(lifetime) + + +@contextlib.contextmanager +def local_s3_server(): + # Use fake aws credentials + os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" + os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + p = multiprocessing.Process(target=start_s3_server) + p.start() + yield + p.kill() + + +def create_client_and_bucket(): + client = boto3.client("s3", endpoint_url=os.getenv("AWS_ENDPOINT_URL", None)) + try: + client.create_bucket(Bucket=args.bucket, ACL="public-read-write") + except ( + client.exceptions.BucketAlreadyOwnedByYou, + client.exceptions.BucketAlreadyExists, + ): + pass + except Exception: + print( + "Problem accessing the S3 server? using wrong credentials? Try setting " + "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and/or AWS_ENDPOINT_URL. 
" + "Alternatively, use the bundled server `--use-bundled-server`\n", + file=sys.stderr, + flush=True, + ) + raise + return client + + +def run_numpy_like(args, xp): + # Upload data to S3 server + data = numpy.arange(args.nelem, dtype=args.dtype) + recv = xp.empty_like(data) + + client = create_client_and_bucket() + client.put_object(Bucket=args.bucket, Key="data1", Body=bytes(data)) + + def run() -> float: + t0 = time.perf_counter() + with kvikio.RemoteFile(bucket_name=args.bucket, object_name="data1") as f: + res = f.read(recv) + t1 = time.perf_counter() + assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" + xp.testing.assert_array_equal(data, recv) + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +def run_cudf(args, use_kvikio_s3): + import cudf + + # Upload data to S3 server + create_client_and_bucket() + data = cupy.random.rand(args.nelem).astype(args.dtype) + df = cudf.DataFrame({"a": data}) + df.to_parquet(f"s3://{args.bucket}/data1") + + def run() -> float: + t0 = time.perf_counter() + cudf.read_parquet(f"s3://{args.bucket}/data1", use_kvikio_s3=use_kvikio_s3) + t1 = time.perf_counter() + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +API = { + "cupy-kvikio": partial(run_numpy_like, xp=cupy), + "numpy-kvikio": partial(run_numpy_like, xp=numpy), + "cudf-kvikio": partial(run_cudf, use_kvikio_s3=True), + "cudf-fsspec": partial(run_cudf, use_kvikio_s3=False), +} + + +def main(args): + cupy.cuda.set_allocator(None) # Disable CuPy's default memory pool + cupy.arange(10) # Make sure CUDA is initialized + + kvikio.defaults.num_threads_reset(args.nthreads) + print("Roundtrip benchmark") + print("--------------------------------------") + print(f"nelem | {args.nelem} ({format_bytes(args.nbytes)})") + print(f"dtype | {args.dtype}") + print(f"nthreads | {args.nthreads}") + print(f"nruns | {args.nruns}") + print(f"server | {os.getenv('AWS_ENDPOINT_URL', 'http://*.amazonaws.com')}") + if 
args.use_bundled_server: + print("--------------------------------------") + print("Using the bundled local server is slow") + print("and can be misleading. Consider using") + print("a local MinIO or officel S3 server.") + print("======================================") + + # Run each benchmark using the requested APIs + for api in args.api: + res = [] + for elapsed in API[api](args): + res.append(elapsed) + + def pprint_api_res(name, samples): + samples = [args.nbytes / s for s in samples] # Convert to throughput + mean = statistics.mean(samples) if len(samples) > 1 else samples[0] + ret = f"{api}-{name}".ljust(18) + ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) + if len(samples) > 1: + stdev = statistics.stdev(samples) / mean * 100 + ret += " ± %5.2f %%" % stdev + ret += " (" + for sample in samples: + ret += f"{format_bytes(sample)}/s, " + ret = ret[:-2] + ")" # Replace trailing comma + return ret + + print(pprint_api_res("read", res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Roundtrip benchmark") + parser.add_argument( + "-n", + "--nelem", + metavar="NELEM", + default="1024", + type=int, + help="Number of elements (default: %(default)s).", + ) + parser.add_argument( + "--dtype", + metavar="DATATYPE", + default="float32", + type=numpy.dtype, + help="The data type of each element (default: %(default)s).", + ) + parser.add_argument( + "--nruns", + metavar="RUNS", + default=1, + type=int, + help="Number of runs per API (default: %(default)s).", + ) + parser.add_argument( + "-t", + "--nthreads", + metavar="THREADS", + default=1, + type=int, + help="Number of threads to use (default: %(default)s).", + ) + parser.add_argument( + "--use-bundled-server", + action="store_true", + help="Launch and use a local slow S3 server (ThreadedMotoServer).", + ) + parser.add_argument( + "--bucket", + metavar="NAME", + default="kvikio-s3-benchmark", + type=str, + help="Name of the AWS S3 bucket to use (default: %(default)s).", + ) + 
parser.add_argument( + "--api", + metavar="API", + default=list(API.keys())[0], # defaults to the first API + nargs="+", + choices=tuple(API.keys()) + ("all",), + help="List of APIs to use {%(choices)s} (default: %(default)s).", + ) + args = parser.parse_args() + args.nbytes = args.nelem * args.dtype.itemsize + if "all" in args.api: + args.api = tuple(API.keys()) + + ctx: ContextManager = contextlib.nullcontext() + if args.use_bundled_server: + os.environ["AWS_ENDPOINT_URL"] = f"http://127.0.0.1:{get_local_port()}" + ctx = local_s3_server() + with ctx: + main(args) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py new file mode 100644 index 0000000000..00c788b6b0 --- /dev/null +++ b/python/kvikio/kvikio/remote_file.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +from __future__ import annotations + +from typing import Optional + +from kvikio.cufile import IOFuture + + +def is_remote_file_available() -> bool: + try: + import kvikio._lib.remote_handle # noqa: F401 + except ImportError: + return False + else: + return True + + +def _get_remote_remote_file_class(): + if not is_remote_file_available(): + raise RuntimeError( + "RemoteFile not available, please build KvikIO with AWS S3 support" + ) + import kvikio._lib.remote_handle + + return kvikio._lib.remote_handle.RemoteFile + + +class RemoteFile: + """File handle of a remote file""" + + def __init__(self, bucket_name: str, object_name: str): + self._handle = _get_remote_remote_file_class().from_bucket_and_object( + bucket_name, object_name + ) + + @classmethod + def from_url(cls, url: str) -> RemoteFile: + ret = object.__new__(cls) + ret._handle = _get_remote_remote_file_class().from_url(url) + return ret + + def __enter__(self) -> RemoteFile: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + pass + + def nbytes(self) -> int: + return self._handle.nbytes() + + def pread(self, buf, size: 
Optional[int] = None, file_offset: int = 0) -> IOFuture: + return IOFuture(self._handle.pread(buf, size, file_offset)) + + def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: + return self.pread(buf, size, file_offset).get() diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 3f68177280..9ceb8be8f6 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -38,8 +38,10 @@ classifiers = [ [project.optional-dependencies] test = [ + "boto3>=1.21.21", "cuda-python>=11.7.1,<12.0a0", "dask>=2022.05.2", + "moto>=4.0.8", "pytest", "pytest-cov", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py new file mode 100644 index 0000000000..04e0d08c0a --- /dev/null +++ b/python/kvikio/tests/test_aws_s3.py @@ -0,0 +1,137 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import multiprocessing as mp +import os +import socket +import time +from contextlib import contextmanager + +import pytest + +import kvikio + +# TODO: remove before PR merge. Trigger CI error if the remote module wasn't built +import kvikio._lib.remote_handle # isort: skip + +moto = pytest.importorskip("moto", minversion="3.1.6") +boto3 = pytest.importorskip("boto3") + +if not kvikio.is_remote_file_available(): + pytest.skip( + "cannot test remote IO, please build KvikIO with with AWS S3 support", + allow_module_level=True, + ) + +ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer + + +@pytest.fixture(scope="session") +def endpoint_ip(): + return "127.0.0.1" + + +@pytest.fixture(scope="session") +def endpoint_port(): + # Return a free port per worker session. 
+ sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +@contextmanager +def ensure_safe_environment_variables(): + """ + Get a context manager to safely set environment variables + All changes will be undone on close, hence environment variables set + within this contextmanager will neither persist nor change global state. + """ + saved_environ = dict(os.environ) + try: + yield + finally: + os.environ.clear() + os.environ.update(saved_environ) + + +def start_s3_server(ip_address, port): + server = ThreadedMotoServer(ip_address=ip_address, port=port) + server.start() + time.sleep(120) + print("ThreadedMotoServer shutting down because of timeout (120s)") + + +@pytest.fixture(scope="session") +def s3_base(endpoint_ip, endpoint_port): + """ + Fixture to set up moto server in separate process + """ + with ensure_safe_environment_variables(): + # Use fake aws credentials + os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" + os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" + os.environ["AWS_SECURITY_TOKEN"] = "foobar_security_token" + os.environ["AWS_SESSION_TOKEN"] = "foobar_session_token" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + os.environ["AWS_ENDPOINT_URL"] = f"http://{endpoint_ip}:{endpoint_port}" + + p = mp.Process(target=start_s3_server, args=(endpoint_ip, endpoint_port)) + p.start() + yield os.environ["AWS_ENDPOINT_URL"] + p.kill() + + +@contextmanager +def s3_context(s3_base, bucket, files=None): + if files is None: + files = {} + with ensure_safe_environment_variables(): + client = boto3.client("s3", endpoint_url=s3_base) + client.create_bucket(Bucket=bucket, ACL="public-read-write") + for f, data in files.items(): + client.put_object(Bucket=bucket, Key=f, Body=data) + yield + for f, data in files.items(): + try: + client.delete_object(Bucket=bucket, Key=f) + except Exception: + pass + + +def test_read(s3_base, xp): + bucket_name = "test_read" + object_name = "a1" + a = 
xp.arange(10_000_000) + with s3_context(s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}): + with kvikio.RemoteFile(bucket_name, object_name) as f: + assert f.nbytes() == a.nbytes + b = xp.empty_like(a) + assert f.read(buf=b) == a.nbytes + xp.testing.assert_array_equal(a, b) + + +@pytest.mark.parametrize( + "start,end", + [ + (0, 10 * 4096), + (1, int(1.3 * 4096)), + (int(2.1 * 4096), int(5.6 * 4096)), + (42, int(2**23)), + ], +) +def test_read_with_file_offset(s3_base, xp, start, end): + bucket_name = "test_read" + object_name = "a1" + a = xp.arange(end, dtype=xp.int64) + with s3_context(s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}): + with kvikio.RemoteFile(bucket_name, object_name) as f: + b = xp.zeros(shape=(end - start,), dtype=xp.int64) + assert f.read(b, file_offset=start * a.itemsize) == b.nbytes + xp.testing.assert_array_equal(a[start:end], b) + + with kvikio.RemoteFile.from_url(f"s3://{bucket_name}/{object_name}") as f: + b = xp.zeros(shape=(end - start,), dtype=xp.int64) + assert f.read(b, file_offset=start * a.itemsize) == b.nbytes + xp.testing.assert_array_equal(a[start:end], b) diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 3bdaf6613e..30cee08be9 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -8,6 +8,8 @@ import pytest +import kvikio + benchmarks_path = ( Path(os.path.realpath(__file__)).parent.parent / "kvikio" / "benchmarks" ) @@ -78,3 +80,47 @@ def test_zarr_io(run_cmd, tmp_path, api): cwd=benchmarks_path, ) assert retcode == 0 + + +@pytest.mark.parametrize( + "api", + [ + "cupy-kvikio", + "numpy-kvikio", + "cudf-kvikio", + "cudf-fsspec", + ], +) +def test_aws_s3_io(run_cmd, api): + """Test benchmarks/aws_s3_io.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "cannot test remote IO, please build KvikIO with with AWS S3 support", + allow_module_level=True, + ) + 
pytest.importorskip("boto3") + pytest.importorskip("moto") + if "cudf" in api: + pytest.importorskip("cudf") + + if api == "cudf-kvikio": + pytest.skip( + "Enable when has been merged" + ) + + retcode = run_cmd( + cmd=[ + sys.executable or "python", + "aws_s3_io.py", + "--use-bundled-server", + "-n", + "1000", + "-t", + "4", + "--api", + api, + ], + cwd=benchmarks_path, + ) + assert retcode == 0 From 4a5884ce6ff94e4dfde2c0bdf8a6de068bb7de66 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 13 Aug 2024 08:46:55 +0200 Subject: [PATCH 02/88] benchmark: use cudf.set_option --- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index 3c9d32e6c7..58e6222d07 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -96,9 +96,11 @@ def run() -> float: yield run() -def run_cudf(args, use_kvikio_s3): +def run_cudf(args, use_kvikio_s3: bool): import cudf + cudf.set_option("native_s3_io", use_kvikio_s3) + # Upload data to S3 server create_client_and_bucket() data = cupy.random.rand(args.nelem).astype(args.dtype) @@ -107,7 +109,7 @@ def run_cudf(args, use_kvikio_s3): def run() -> float: t0 = time.perf_counter() - cudf.read_parquet(f"s3://{args.bucket}/data1", use_kvikio_s3=use_kvikio_s3) + cudf.read_parquet(f"s3://{args.bucket}/data1") t1 = time.perf_counter() return t1 - t0 From 804fd998a899042080212c7ec6644b3321c550fa Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 09:37:03 +0200 Subject: [PATCH 03/88] revert some minor changes --- cpp/include/kvikio/file_handle.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index cef5f4ed1f..80779c5282 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -144,7 +144,7 @@ class FileHandle { bool _initialized{false}; bool _compat_mode{false}; mutable std::size_t _nbytes{0}; // The size of the underlying file, zero means unknown. - CUfileHandle_t _handle{nullptr}; + CUfileHandle_t _handle{}; public: static constexpr mode_t m644 = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH; @@ -208,7 +208,7 @@ class FileHandle { _initialized{std::exchange(o._initialized, false)}, _compat_mode{std::exchange(o._compat_mode, false)}, _nbytes{std::exchange(o._nbytes, 0)}, - _handle{std::exchange(o._handle, CUfileHandle_t{nullptr})} + _handle{std::exchange(o._handle, CUfileHandle_t{})} { } FileHandle& operator=(FileHandle&& o) noexcept @@ -218,7 +218,7 @@ class FileHandle { _initialized = std::exchange(o._initialized, false); _compat_mode = std::exchange(o._compat_mode, false); _nbytes = std::exchange(o._nbytes, 0); - _handle = std::exchange(o._handle, CUfileHandle_t{nullptr}); + _handle = std::exchange(o._handle, CUfileHandle_t{}); return *this; } ~FileHandle() noexcept { close(); } @@ -232,8 +232,8 @@ class FileHandle { { if (closed()) { return; } - if (_handle != nullptr) { cuFileAPI::instance().HandleDeregister(_handle); } - if (_fd_direct_off != -1) { ::close(_fd_direct_off); } + if (!_compat_mode) { cuFileAPI::instance().HandleDeregister(_handle); } + ::close(_fd_direct_off); if (_fd_direct_on != -1) { ::close(_fd_direct_on); } _fd_direct_on = -1; _fd_direct_off = -1; From bbbe637f45c06e893ed9db30908b3317aed68dab Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 10:45:40 +0200 Subject: [PATCH 04/88] doc --- cpp/include/kvikio/remote_handle.hpp | 102 +++++++++++++++++++++---- python/kvikio/tests/test_benchmarks.py | 5 -- 2 files changed, 86 insertions(+), 21 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 78f21d3024..778df653e4 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -33,7 +33,6 @@ #include #include -using namespace std::chrono; namespace kvikio { namespace detail { @@ -49,27 +48,46 @@ class BufferAsStream : public Aws::IOStream { ~BufferAsStream() override = default; }; +/** + * @brief S3 context, which initialize and maintains the S3 API and client. + */ class S3Context { public: S3Context() : _client{S3Context::create_client()} {} + /** + * @brief Get a reference to the S3 client + * + * @return S3 client + */ Aws::S3::S3Client& client() { return *_client; } + /** + * @brief Get the default context, which is created on first call + * + * @return The default S3 context + */ static S3Context& default_context() { static S3Context _default_context; return _default_context; } + // No copy semantic S3Context(S3Context const&) = delete; void operator=(S3Context const&) = delete; private: + /** + * @brief Initialize the S3 API (idempotent) + * + * This private function is called as part of `S3Context` creation and it makes sure to only call + * `Aws::InitAPI()` once. 
+ */ static void ensure_aws_s3_api_init() { static bool not_initalized{true}; if (not_initalized) { - std::cout << "ensure_aws_s3_api_initalized INIT" << std::endl; not_initalized = false; Aws::SDKOptions options; @@ -78,6 +96,11 @@ class S3Context { } } + /** + * @brief Create a new S3 client + * + * @return The new client + */ static std::shared_ptr create_client() { S3Context::ensure_aws_s3_api_init(); @@ -110,6 +133,13 @@ class S3Context { std::shared_ptr _client; }; +/** + * @brief Get the size of a S3 file + * + * @param bucket_name The bucket name. + * @param object_name The object name. + * @return Size of the file in bytes. + */ inline std::size_t get_s3_file_size(const std::string& bucket_name, const std::string& object_name) { KVIKIO_NVTX_FUNC_RANGE(); @@ -125,6 +155,12 @@ inline std::size_t get_s3_file_size(const std::string& bucket_name, const std::s return outcome.GetResult().GetContentLength(); } +/** + * @brief Given a file path like "s3:///", return the name of the bucket and object. + * + * @param path S3 file path. + * @return Pair of strings: [bucket-name, object-name]. + */ inline std::pair parse_s3_path(const std::string& path) { if (path.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } @@ -141,7 +177,7 @@ inline std::pair parse_s3_path(const std::string& path } // namespace detail /** - * @brief Handle of + * @brief Handle of remote file. * * At the moment, only AWS S3 is the supported */ @@ -152,40 +188,56 @@ class RemoteHandle { std::size_t _nbytes{}; public: + // Use of a default constructed instance is undefined behavior. RemoteHandle() noexcept = default; + /** + * @brief Construct from a bucket and object name. + * + * @param bucket_name Name of the bucket. + * @param object_name Name of the object. 
+ */ RemoteHandle(std::string bucket_name, std::string object_name) : _bucket_name(std::move(bucket_name)), _object_name(std::move(object_name)), _nbytes(detail::get_s3_file_size(_bucket_name, _object_name)) { - std::cout << "RemoteHandle() - bucket_name: " << _bucket_name - << ", object_name: " << _object_name << ", nbytes: " << _nbytes << std::endl; } + /** + * @brief Construct from a remote path such as "s3:///". + * + * @param remote_path Remote file path. + */ RemoteHandle(const std::string& remote_path) { auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); _bucket_name = std::move(bucket_name); _object_name = std::move(object_name); _nbytes = detail::get_s3_file_size(_bucket_name, _object_name); - - std::cout << "RemoteHandle() - remote_path: " << remote_path - << ", bucket_name: " << _bucket_name << ", object_name: " << _object_name - << ", nbytes: " << _nbytes << std::endl; } /** - * @brief Get the file size + * @brief Get the file size. + * + * Note, this is very fast, no communication needed. * - * @return The number of bytes + * @return The number of bytes. */ [[nodiscard]] inline std::size_t nbytes() const { return _nbytes; } + /** + * @brief Read from remote source into host memory. + * + * @param buf Pointer to host memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @return Number of bytes read. 
+ */ std::size_t read_to_host(void* buf, std::size_t size, std::size_t file_offset = 0) { KVIKIO_NVTX_FUNC_RANGE("AWS S3 receive", size); - auto t0 = high_resolution_clock::now(); + auto t0 = std::chrono::high_resolution_clock::now(); auto& default_context = detail::S3Context::default_context(); Aws::S3::Model::GetObjectRequest req; @@ -210,15 +262,23 @@ class RemoteHandle { throw std::runtime_error("S3 read of " + std::to_string(size) + " bytes failed, received " + std::to_string(n) + " bytes"); } - auto t1 = high_resolution_clock::now(); - float duration = size / (duration_cast(t1 - t0).count() / 1000000.0); - + auto t1 = std::chrono::high_resolution_clock::now(); + float duration = + size / (std::chrono::duration_cast(t1 - t0).count() / 1000000.0); std::cout << "RemoteHandle::read_to_host() - buf: " << buf << ", size: " << size << ", file_offset: " << file_offset << ", bw: " << duration / (2 << 20) << " MiB/s" << std::endl; return n; } + /** + * @brief Read from remote source into buffer (host or device memory). + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @return Number of bytes read, which is `size` always. + */ std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0) { KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::read()", size); @@ -247,10 +307,20 @@ class RemoteHandle { return size; } + /** + * @brief Read from remote source into buffer (host or device memory) in parallel. + * + * Contrary to `FileHandle::pread()`, a task size of 16 MiB is used always. + * See `kvikio::posix_bounce_buffer_size`. + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @return Number of bytes read, which is `size` always. 
+ */ std::future pread(void* buf, std::size_t size, std::size_t file_offset = 0) { KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::pread()", size); - std::cout << "RemoteHandle::pread()" << std::endl; auto task = [this](void* devPtr_base, std::size_t size, std::size_t file_offset, diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 30cee08be9..8ef4219ab1 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -104,11 +104,6 @@ def test_aws_s3_io(run_cmd, api): if "cudf" in api: pytest.importorskip("cudf") - if api == "cudf-kvikio": - pytest.skip( - "Enable when has been merged" - ) - retcode = run_cmd( cmd=[ sys.executable or "python", From a752eee288efd070ae637f3b2da405a5b60ff8fe Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 13 Aug 2024 11:05:44 +0200 Subject: [PATCH 05/88] doc --- python/kvikio/kvikio/remote_file.py | 76 +++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 00c788b6b0..016678f471 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -28,15 +28,43 @@ def _get_remote_remote_file_class(): class RemoteFile: - """File handle of a remote file""" + """File handle of a remote file (only AWS S3 is the supported). + + Please make sure that AWS credentials have been configure on the system. + A common way to do this, is to define the set the environment variables: + `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + + Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see + . + """ def __init__(self, bucket_name: str, object_name: str): + """Open a remote file given a bucket and object name. + + Parameters + ---------- + bucket_name + Name of the bucket. + object_name + Name of the object. 
+ """ self._handle = _get_remote_remote_file_class().from_bucket_and_object( bucket_name, object_name ) @classmethod def from_url(cls, url: str) -> RemoteFile: + """Open a remote file given an url such as "s3:///". + + Parameters + ---------- + url + URL to the remote file. + + Returns + ------- + A newly opened remote file + """ ret = object.__new__(cls) ret._handle = _get_remote_remote_file_class().from_url(url) return ret @@ -48,10 +76,50 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: pass def nbytes(self) -> int: - return self._handle.nbytes() + """Get the file size. - def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: - return IOFuture(self._handle.pread(buf, size, file_offset)) + Note, this is very fast, no communication needed. + + Returns + ------- + The number of bytes. + """ + return self._handle.nbytes() def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf: buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + The size of bytes that were successfully read. + """ return self.pread(buf, size, file_offset).get() + + def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf: buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + IOFuture + Future that on completion returns the size of bytes that were successfully + read. + """ + return IOFuture(self._handle.pread(buf, size, file_offset)) From ce4218a36daf4587bf4ceabaf9a1425bb2418dd6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 11:13:16 +0200 Subject: [PATCH 06/88] doc --- cpp/include/kvikio/remote_handle.hpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 778df653e4..fa1edeb25b 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -38,7 +38,7 @@ namespace kvikio { namespace detail { /** - * Stream implementation of a fixed size buffer + * Stream implementation of a fixed size buffer. */ class BufferAsStream : public Aws::IOStream { public: @@ -56,16 +56,16 @@ class S3Context { S3Context() : _client{S3Context::create_client()} {} /** - * @brief Get a reference to the S3 client + * @brief Get a reference to the S3 client. * - * @return S3 client + * @return S3 client. */ Aws::S3::S3Client& client() { return *_client; } /** - * @brief Get the default context, which is created on first call + * @brief Get the default context, which is created on first call. * - * @return The default S3 context + * @return The default S3 context. */ static S3Context& default_context() { @@ -177,9 +177,14 @@ inline std::pair parse_s3_path(const std::string& path } // namespace detail /** - * @brief Handle of remote file. + * @brief Handle of remote file (only AWS S3 is the supported). * - * At the moment, only AWS S3 is the supported + * Please make sure that AWS credentials have been configure on the system. + * A common way to do this, is to define the set the environment variables: + * `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + * + * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see + * . */ class RemoteHandle { private: From 5ba191d9de524411df68571136570704d9162373 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 11:17:44 +0200 Subject: [PATCH 07/88] doc --- cpp/include/kvikio/remote_handle.hpp | 2 +- python/kvikio/kvikio/remote_file.py | 2 +- python/kvikio/tests/test_aws_s3.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index fa1edeb25b..e479f5d554 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -177,7 +177,7 @@ inline std::pair parse_s3_path(const std::string& path } // namespace detail /** - * @brief Handle of remote file (only AWS S3 is the supported). + * @brief Handle of remote file (currently, only AWS S3 is supported). * * Please make sure that AWS credentials have been configure on the system. * A common way to do this, is to define the set the environment variables: diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 016678f471..81eb8806f9 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -28,7 +28,7 @@ def _get_remote_remote_file_class(): class RemoteFile: - """File handle of a remote file (only AWS S3 is the supported). + """File handle of a remote file (currently, only AWS S3 is supported). Please make sure that AWS credentials have been configure on the system. 
A common way to do this, is to define the set the environment variables: diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 04e0d08c0a..d6690173c1 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -59,8 +59,8 @@ def ensure_safe_environment_variables(): def start_s3_server(ip_address, port): server = ThreadedMotoServer(ip_address=ip_address, port=port) server.start() - time.sleep(120) - print("ThreadedMotoServer shutting down because of timeout (120s)") + time.sleep(60) + print("ThreadedMotoServer shutting down because of timeout (60s)") @pytest.fixture(scope="session") From 5126b44c8c165b5ebefcb645e20fb48f2f732170 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 13 Aug 2024 11:26:51 +0200 Subject: [PATCH 08/88] --bundled-server-lifetime --- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 15 +++++++++++---- python/kvikio/tests/test_aws_s3.py | 4 ++-- python/kvikio/tests/test_benchmarks.py | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index 58e6222d07..cf2194d8a7 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -31,7 +31,7 @@ def get_local_port() -> int: return port -def start_s3_server(lifetime=3600): +def start_s3_server(lifetime: int): from moto.server import ThreadedMotoServer # Silence the activity info from ThreadedMotoServer @@ -43,12 +43,12 @@ def start_s3_server(lifetime=3600): @contextlib.contextmanager -def local_s3_server(): +def local_s3_server(lifetime: int): # Use fake aws credentials os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" os.environ["AWS_DEFAULT_REGION"] = "us-east-1" - p = multiprocessing.Process(target=start_s3_server) + p = multiprocessing.Process(target=start_s3_server, args=(lifetime,)) p.start() yield p.kill() @@ 
-204,6 +204,13 @@ def pprint_api_res(name, samples): action="store_true", help="Launch and use a local slow S3 server (ThreadedMotoServer).", ) + parser.add_argument( + "--bundled-server-lifetime", + metavar="SECONDS", + default=3600, + type=int, + help="Maximum lifetime of the bundled server (default: %(default)s).", + ) parser.add_argument( "--bucket", metavar="NAME", @@ -227,6 +234,6 @@ def pprint_api_res(name, samples): ctx: ContextManager = contextlib.nullcontext() if args.use_bundled_server: os.environ["AWS_ENDPOINT_URL"] = f"http://127.0.0.1:{get_local_port()}" - ctx = local_s3_server() + ctx = local_s3_server(args.bundled_server_lifetime) with ctx: main(args) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index d6690173c1..04e0d08c0a 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -59,8 +59,8 @@ def ensure_safe_environment_variables(): def start_s3_server(ip_address, port): server = ThreadedMotoServer(ip_address=ip_address, port=port) server.start() - time.sleep(60) - print("ThreadedMotoServer shutting down because of timeout (60s)") + time.sleep(120) + print("ThreadedMotoServer shutting down because of timeout (120s)") @pytest.fixture(scope="session") diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 8ef4219ab1..f2c626cac0 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -109,6 +109,8 @@ def test_aws_s3_io(run_cmd, api): sys.executable or "python", "aws_s3_io.py", "--use-bundled-server", + "--bundled-server-lifetime", + "30", "-n", "1000", "-t", From 185e687bc60539289b7b7922654e63af09877d6f Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 11:35:03 +0200 Subject: [PATCH 09/88] clean up --- cpp/include/kvikio/file_handle.hpp | 2 +- cpp/include/kvikio/remote_handle.hpp | 2 +- python/kvikio/tests/test_aws_s3.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 80779c5282..68953f5798 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -286,7 +286,7 @@ class FileHandle { * * @return The number of bytes */ - [[nodiscard]] inline std::size_t nbytes() const + [[nodiscard]] std::size_t nbytes() const { if (closed()) { return 0; } if (_nbytes == 0) { _nbytes = detail::get_file_size(_fd_direct_off); } diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index e479f5d554..40577164d7 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -229,7 +229,7 @@ class RemoteHandle { * * @return The number of bytes. */ - [[nodiscard]] inline std::size_t nbytes() const { return _nbytes; } + [[nodiscard]] std::size_t nbytes() const { return _nbytes; } /** * @brief Read from remote source into host memory. diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 04e0d08c0a..51844c32cb 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -59,8 +59,8 @@ def ensure_safe_environment_variables(): def start_s3_server(ip_address, port): server = ThreadedMotoServer(ip_address=ip_address, port=port) server.start() - time.sleep(120) - print("ThreadedMotoServer shutting down because of timeout (120s)") + time.sleep(180) + print("ThreadedMotoServer shutting down because of timeout (180s)") @pytest.fixture(scope="session") From 502f7cf55ce6ab5fb2ea59652d89419b373260e6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 13:00:34 +0200 Subject: [PATCH 10/88] doc --- cpp/include/kvikio/remote_handle.hpp | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 40577164d7..a12e1c1978 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -105,29 +105,18 @@ class S3Context { { S3Context::ensure_aws_s3_api_init(); + // Read AWS_ENDPOINT_URL to overwrite endpoint Aws::Client::ClientConfiguration clientConfig; - // Optional: Set to the AWS Region (overrides config file). - // clientConfig.region = "us-east-1"; + const char* ep = getenv("AWS_ENDPOINT_URL"); + if (ep != nullptr) { clientConfig.endpointOverride = ep; } - const char* endpointOverride = getenv("AWS_ENDPOINT_URL"); - if (endpointOverride != nullptr) { clientConfig.endpointOverride = endpointOverride; } - - // You don't normally have to test that you are authenticated. But the S3 service permits - // anonymous requests, thus the s3Client will return "success" even if you are - // unauthenticated, which can be confusing to a new user. - auto provider = Aws::MakeShared("alloc-tag"); + // We check authentication here to trigger an early exception. 
+ auto provider = Aws::MakeShared("check-creds"); auto creds = provider->GetAWSCredentials(); if (creds.IsEmpty()) { - throw std::runtime_error(std::string("Failed authentication to ") + endpointOverride); - } - auto ret = std::make_shared(Aws::S3::S3Client(clientConfig)); - - // Try the connection - auto outcome = ret->ListBuckets(); - if (!outcome.IsSuccess()) { - throw std::runtime_error(std::string("S3 error: ") + outcome.GetError().GetMessage()); + throw std::runtime_error(std::string("Failed authentication to ") + ep); } - return ret; + return std::make_shared(Aws::S3::S3Client(clientConfig)); } std::shared_ptr _client; From 740a15db5ca332f64eeab47f25edf960d1808507 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 13 Aug 2024 13:06:19 +0200 Subject: [PATCH 11/88] cleanup --- cpp/include/kvikio/remote_handle.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index a12e1c1978..c5f801f8a8 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -256,11 +256,10 @@ class RemoteHandle { throw std::runtime_error("S3 read of " + std::to_string(size) + " bytes failed, received " + std::to_string(n) + " bytes"); } - auto t1 = std::chrono::high_resolution_clock::now(); - float duration = - size / (std::chrono::duration_cast(t1 - t0).count() / 1000000.0); + auto t1 = std::chrono::high_resolution_clock::now(); + float dur = std::chrono::duration_cast(t1 - t0).count() / 1000000.0; std::cout << "RemoteHandle::read_to_host() - buf: " << buf << ", size: " << size - << ", file_offset: " << file_offset << ", bw: " << duration / (2 << 20) << " MiB/s" + << ", file_offset: " << file_offset << ", bw: " << size / dur / (2 << 20) << " MiB/s" << std::endl; return n; } From 7fbabdfaeef062d6adfb5e44d5702c2b8f504889 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 13 Aug 2024 13:27:24 +0200 Subject: [PATCH 12/88] doc --- python/kvikio/kvikio/cufile.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/python/kvikio/kvikio/cufile.py b/python/kvikio/kvikio/cufile.py index ead7bc5f7a..dfe4c0fe05 100644 --- a/python/kvikio/kvikio/cufile.py +++ b/python/kvikio/kvikio/cufile.py @@ -140,9 +140,8 @@ def pread( Returns ------- - IOFuture - Future that on completion returns the size of bytes that were successfully - read. + Future that on completion returns the size of bytes that were successfully + read. Notes ----- @@ -187,9 +186,8 @@ def pwrite( Returns ------- - IOFuture - Future that on completion returns the size of bytes that were successfully - written. + Future that on completion returns the size of bytes that were successfully + written. Notes ----- @@ -307,12 +305,11 @@ def raw_read_async( Returns ------- - IOFutureStream - Future that when executed ".check_bytes_done()" returns the size of bytes - that were successfully read. The instance must be kept alive until - all data has been read from disk. One way to do this, is by calling - `IOFutureStream.check_bytes_done()`, which will synchronize the associated - stream and return the number of bytes read. + Future that when executed ".check_bytes_done()" returns the size of bytes + that were successfully read. The instance must be kept alive until + all data has been read from disk. One way to do this, is by calling + `IOFutureStream.check_bytes_done()`, which will synchronize the associated + stream and return the number of bytes read. """ return self._handle.read_async(buf, size, file_offset, dev_offset, stream) @@ -342,12 +339,11 @@ def raw_write_async( Returns ------- - IOFutureStream - Future that when executed ".check_bytes_done()" returns the size of bytes - that were successfully written. The instance must be kept alive until - all data has been written to disk. 
One way to do this, is by calling - `IOFutureStream.check_bytes_done()`, which will synchronize the associated - stream and return the number of bytes written. + Future that when executed ".check_bytes_done()" returns the size of bytes + that were successfully written. The instance must be kept alive until + all data has been written to disk. One way to do this, is by calling + `IOFutureStream.check_bytes_done()`, which will synchronize the associated + stream and return the number of bytes written. """ return self._handle.write_async(buf, size, file_offset, dev_offset, stream) From 5b41e74fc3fe35f993f1f1b075753dc9167ac833 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 13 Aug 2024 13:29:19 +0200 Subject: [PATCH 13/88] doc --- python/kvikio/kvikio/remote_file.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 81eb8806f9..4775497cf4 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -118,8 +118,7 @@ def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFutu Returns ------- - IOFuture - Future that on completion returns the size of bytes that were successfully - read. + Future that on completion returns the size of bytes that were successfully + read. """ return IOFuture(self._handle.pread(buf, size, file_offset)) From 555903d27ee18e8811a89eaf7992718ac59f066a Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 16 Aug 2024 08:49:35 +0200 Subject: [PATCH 14/88] doc --- cpp/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 646e676297..c489b29088 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -55,6 +55,8 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) +# If AWSSDK isn't found, the Cython module remote_handle.pyx isn't built and C++ users shouldn't +# include rapids_find_package( AWSSDK COMPONENTS s3 BUILD_EXPORT_SET kvikio-exports From 69f7fd9f3242418aae74be1199a9db31a8ebbf27 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 16 Aug 2024 09:33:22 -0400 Subject: [PATCH 15/88] Re-run CI From 39ca408524b9fe8fc4afce30c0adbb35d4734b53 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 16 Aug 2024 15:03:07 -0400 Subject: [PATCH 16/88] Fix CMake code --- cpp/CMakeLists.txt | 15 ++++++++++++++- python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c489b29088..5a6f8215d2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -63,6 +63,17 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) +if(AWSSDK_FOUND) + get_property(_lib_type TARGET aws-cpp-sdk-s3 PROPERTY TYPE) + if(_lib_type STREQUAL "STATIC_LIBRARY") + rapids_find_package( + ZLIB + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + ) + endif() +endif() + rapids_find_package( cuFile BUILD_EXPORT_SET kvikio-exports @@ -138,8 +149,10 @@ target_include_directories( ) target_link_libraries( kvikio INTERFACE Threads::Threads ${CMAKE_DL_LIBS} nvtx3::nvtx3-cpp BS::thread_pool - ${AWSSDK_LINK_LIBRARIES} ) +if(AWSSDK_FOUND) + target_link_libraries(kvikio INTERFACE aws-cpp-sdk-s3) +endif() target_compile_features(kvikio INTERFACE cxx_std_17) # optionally build examples diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 
2eec2d2668..9237e37116 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -27,5 +27,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp ${AWSSDK_LINK_LIBRARIES} + LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp ) From 1c601ad407113da9d169d9c5fc7ca3fda7ffe7e0 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 16 Aug 2024 16:19:57 -0400 Subject: [PATCH 17/88] Style --- cpp/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5a6f8215d2..620a85e8a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -64,7 +64,11 @@ rapids_find_package( ) if(AWSSDK_FOUND) - get_property(_lib_type TARGET aws-cpp-sdk-s3 PROPERTY TYPE) + get_property( + _lib_type + TARGET aws-cpp-sdk-s3 + PROPERTY TYPE + ) if(_lib_type STREQUAL "STATIC_LIBRARY") rapids_find_package( ZLIB From 551fe20bec323a90a8628720fb634a11ead6688a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 16 Aug 2024 17:06:30 -0400 Subject: [PATCH 18/88] Install moto server support for testing --- dependencies.yaml | 5 +++++ python/kvikio/pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 2dfd954af1..4302d0eaeb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -321,6 +321,11 @@ dependencies: - pytest - pytest-cov - boto3>=1.21.21 + - output_types: [requirements, pyproject] + packages: + - moto[server]>=4.0.8 + - output_types: conda + packages: - moto>=4.0.8 specific: - output_types: [conda, requirements, pyproject] diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 353019149e..e10c63b974 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -41,7 +41,7 @@ test = [ "boto3>=1.21.21", "cuda-python>=11.7.1,<12.0a0", "dask>=2022.05.2", - "moto>=4.0.8", + "moto[server]>=4.0.8", 
"pytest", "pytest-cov", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From b0ff3d1c9e1b84ddf233757722216d81ce275d0d Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 16 Aug 2024 17:36:11 -0400 Subject: [PATCH 19/88] Install cudf for testing --- conda/environments/all_cuda-118_arch-aarch64.yaml | 1 + conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-125_arch-aarch64.yaml | 1 + conda/environments/all_cuda-125_arch-x86_64.yaml | 1 + dependencies.yaml | 14 ++++++++++++++ python/kvikio/pyproject.toml | 1 + 6 files changed, 19 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 82790a443d..6cb0d295d7 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -13,6 +13,7 @@ dependencies: - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cce77c120f..69f46f9ab7 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -13,6 +13,7 @@ dependencies: - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 6500ccfea3..9077c752c8 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -13,6 +13,7 @@ dependencies: - cuda-nvcc - cuda-python>=12.0,<13.0a0 - cuda-version=12.5 +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git 
a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index f5942cc6f6..1bec28dc21 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -13,6 +13,7 @@ dependencies: - cuda-nvcc - cuda-python>=12.0,<13.0a0 - cuda-version=12.5 +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/dependencies.yaml b/dependencies.yaml index 4302d0eaeb..9a02142f9b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -326,6 +326,7 @@ dependencies: - moto[server]>=4.0.8 - output_types: conda packages: + - cudf==24.10.*,>=0.0.0a0 - moto>=4.0.8 specific: - output_types: [conda, requirements, pyproject] @@ -337,3 +338,16 @@ dependencies: - matrix: # All CUDA 11 versions packages: - cuda-python>=11.7.1,<12.0a0 + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - cudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + packages: + - cudf-cu11==24.10.*,>=0.0.0a0 + - matrix: + packages: + - cudf==24.10.*,>=0.0.0a0 diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index e10c63b974..d684f05f11 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -40,6 +40,7 @@ classifiers = [ test = [ "boto3>=1.21.21", "cuda-python>=11.7.1,<12.0a0", + "cudf==24.10.*,>=0.0.0a0", "dask>=2022.05.2", "moto[server]>=4.0.8", "pytest", From 06c0311643cfcb631c7a7a3ee2805e863de6f51d Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 19 Aug 2024 09:54:10 +0200 Subject: [PATCH 20/88] doc --- cpp/include/kvikio/remote_handle.hpp | 2 +- python/kvikio/kvikio/remote_file.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index c5f801f8a8..df4c0dc506 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -169,7 +169,7 @@ inline std::pair parse_s3_path(const std::string& path * @brief Handle of remote file (currently, only AWS S3 is supported). * * Please make sure that AWS credentials have been configure on the system. - * A common way to do this, is to define the set the environment variables: + * A common way to do this, is to set the environment variables: * `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. * * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 4775497cf4..bdc1d1f089 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -31,7 +31,7 @@ class RemoteFile: """File handle of a remote file (currently, only AWS S3 is supported). Please make sure that AWS credentials have been configure on the system. - A common way to do this, is to define the set the environment variables: + A common way to do this, is to set the environment variables: `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. 
Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see From 504f15dd5999ca718d72c189ab6e8085a10e694c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 19 Aug 2024 10:21:20 -0400 Subject: [PATCH 21/88] Temporarily remove cudf test dependency --- conda/environments/all_cuda-118_arch-aarch64.yaml | 1 - conda/environments/all_cuda-118_arch-x86_64.yaml | 1 - conda/environments/all_cuda-125_arch-aarch64.yaml | 1 - conda/environments/all_cuda-125_arch-x86_64.yaml | 1 - dependencies.yaml | 12 ++++++++---- python/kvikio/pyproject.toml | 1 - 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 6cb0d295d7..82790a443d 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -13,7 +13,6 @@ dependencies: - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit -- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 69f46f9ab7..cce77c120f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -13,7 +13,6 @@ dependencies: - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 - cudatoolkit -- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 9077c752c8..6500ccfea3 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -13,7 +13,6 @@ dependencies: - cuda-nvcc - cuda-python>=12.0,<13.0a0 - cuda-version=12.5 -- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml 
b/conda/environments/all_cuda-125_arch-x86_64.yaml index 1bec28dc21..f5942cc6f6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -13,7 +13,6 @@ dependencies: - cuda-nvcc - cuda-python>=12.0,<13.0a0 - cuda-version=12.5 -- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 diff --git a/dependencies.yaml b/dependencies.yaml index 9a02142f9b..fbd70cd185 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -326,7 +326,8 @@ dependencies: - moto[server]>=4.0.8 - output_types: conda packages: - - cudf==24.10.*,>=0.0.0a0 + # TODO: Uncomment once https://github.com/rapidsai/cudf/pull/16499 is merged + #- cudf==24.10.*,>=0.0.0a0 - moto>=4.0.8 specific: - output_types: [conda, requirements, pyproject] @@ -343,11 +344,14 @@ dependencies: - matrix: cuda: "12.*" packages: - - cudf-cu12==24.10.*,>=0.0.0a0 + # TODO: Uncomment once https://github.com/rapidsai/cudf/pull/16499 is merged + #- cudf-cu12==24.10.*,>=0.0.0a0 - matrix: cuda: "11.*" packages: - - cudf-cu11==24.10.*,>=0.0.0a0 + # TODO: Uncomment once https://github.com/rapidsai/cudf/pull/16499 is merged + #- cudf-cu11==24.10.*,>=0.0.0a0 - matrix: packages: - - cudf==24.10.*,>=0.0.0a0 + # TODO: Uncomment once https://github.com/rapidsai/cudf/pull/16499 is merged + #- cudf==24.10.*,>=0.0.0a0 diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index d684f05f11..e10c63b974 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -40,7 +40,6 @@ classifiers = [ test = [ "boto3>=1.21.21", "cuda-python>=11.7.1,<12.0a0", - "cudf==24.10.*,>=0.0.0a0", "dask>=2022.05.2", "moto[server]>=4.0.8", "pytest", From 5f07f2edf269c47740945e7e4d4d990191dad988 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 20 Aug 2024 08:52:20 +0200 Subject: [PATCH 22/88] test: import boto3 and moto --- python/kvikio/tests/test_aws_s3.py | 10 +++++----- python/kvikio/tests/test_benchmarks.py | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 51844c32cb..8bfa706cc8 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -14,16 +14,16 @@ # TODO: remove before PR merge. Trigger CI error if the remote module wasn't built import kvikio._lib.remote_handle # isort: skip -moto = pytest.importorskip("moto", minversion="3.1.6") -boto3 = pytest.importorskip("boto3") - if not kvikio.is_remote_file_available(): pytest.skip( "cannot test remote IO, please build KvikIO with with AWS S3 support", allow_module_level=True, ) -ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer +# Notice, we import boto and moto after the `is_remote_file_available` check. 
+import boto3 +import moto +import moto.server @pytest.fixture(scope="session") @@ -57,7 +57,7 @@ def ensure_safe_environment_variables(): def start_s3_server(ip_address, port): - server = ThreadedMotoServer(ip_address=ip_address, port=port) + server = moto.server.ThreadedMotoServer(ip_address=ip_address, port=port) server.start() time.sleep(180) print("ThreadedMotoServer shutting down because of timeout (180s)") diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index f2c626cac0..ea35b7793e 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -96,11 +96,13 @@ def test_aws_s3_io(run_cmd, api): if not kvikio.is_remote_file_available(): pytest.skip( - "cannot test remote IO, please build KvikIO with with AWS S3 support", - allow_module_level=True, + "cannot test remote IO, please build KvikIO with with AWS S3 support" ) - pytest.importorskip("boto3") - pytest.importorskip("moto") + # Fail early if benchmark dependencies isn't available + import boto3 # noqa: F401 + import moto # noqa: F401 + + # TODO: change to import once https://github.com/rapidsai/cudf/pull/16499 is merged if "cudf" in api: pytest.importorskip("cudf") From f391ad1cee2675b5dc80c675cd298064412c3ab9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 20 Aug 2024 09:01:10 +0200 Subject: [PATCH 23/88] cleanup --- cpp/include/kvikio/remote_handle.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index df4c0dc506..6320e1656b 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -32,8 +32,6 @@ #include #include -#include - namespace kvikio { namespace detail { @@ -231,7 +229,6 @@ class RemoteHandle { std::size_t read_to_host(void* buf, std::size_t size, std::size_t file_offset = 0) { KVIKIO_NVTX_FUNC_RANGE("AWS S3 receive", size); - auto t0 = std::chrono::high_resolution_clock::now(); auto& default_context = detail::S3Context::default_context(); Aws::S3::Model::GetObjectRequest req; @@ -256,11 +253,6 @@ class RemoteHandle { throw std::runtime_error("S3 read of " + std::to_string(size) + " bytes failed, received " + std::to_string(n) + " bytes"); } - auto t1 = std::chrono::high_resolution_clock::now(); - float dur = std::chrono::duration_cast(t1 - t0).count() / 1000000.0; - std::cout << "RemoteHandle::read_to_host() - buf: " << buf << ", size: " << size - << ", file_offset: " << file_offset << ", bw: " << size / dur / (2 << 20) << " MiB/s" - << std::endl; return n; } From 247ab4d725bbd4a605fb11e4be61e3ebaf257552 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 20 Aug 2024 14:58:20 +0200 Subject: [PATCH 24/88] clean up --- cpp/include/kvikio/remote_handle.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 6320e1656b..bb8d128dac 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -109,9 +109,8 @@ class S3Context { if (ep != nullptr) { clientConfig.endpointOverride = ep; } // We check authentication here to trigger an early exception. 
- auto provider = Aws::MakeShared("check-creds"); - auto creds = provider->GetAWSCredentials(); - if (creds.IsEmpty()) { + Aws::Auth::DefaultAWSCredentialsProviderChain provider; + if (provider.GetAWSCredentials().IsEmpty()) { throw std::runtime_error(std::string("Failed authentication to ") + ep); } return std::make_shared(Aws::S3::S3Client(clientConfig)); From 9adffc1b396a495f2197b3b5fce5432f1564f6d9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 20 Aug 2024 15:09:33 +0200 Subject: [PATCH 25/88] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/remote_handle.hpp | 7 +++---- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 2 +- python/kvikio/kvikio/remote_file.py | 2 +- python/kvikio/tests/test_aws_s3.py | 9 ++++----- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 6320e1656b..cf4d2559a9 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -47,7 +47,7 @@ class BufferAsStream : public Aws::IOStream { }; /** - * @brief S3 context, which initialize and maintains the S3 API and client. + * @brief S3 context, which initializes and maintains the S3 API and client. */ class S3Context { public: @@ -105,7 +105,7 @@ class S3Context { // Read AWS_ENDPOINT_URL to overwrite endpoint Aws::Client::ClientConfiguration clientConfig; - const char* ep = getenv("AWS_ENDPOINT_URL"); + const char* ep = std::getenv("AWS_ENDPOINT_URL"); if (ep != nullptr) { clientConfig.endpointOverride = ep; } // We check authentication here to trigger an early exception. 
@@ -281,8 +281,7 @@ class RemoteHandle { while (byte_remaining > 0) { const std::size_t nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); - std::size_t nbytes_got = nbytes_requested; - nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); + std::size_t nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); cur_file_offset += nbytes_got; diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index cf2194d8a7..989e482d58 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -141,7 +141,7 @@ def main(args): print("--------------------------------------") print("Using the bundled local server is slow") print("and can be misleading. Consider using") - print("a local MinIO or officel S3 server.") + print("a local MinIO or official S3 server.") print("======================================") # Run each benchmark using the requested APIs diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index bdc1d1f089..cbda20d8e5 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -30,7 +30,7 @@ def _get_remote_remote_file_class(): class RemoteFile: """File handle of a remote file (currently, only AWS S3 is supported). - Please make sure that AWS credentials have been configure on the system. + Please make sure that AWS credentials have been configured on the system. A common way to do this, is to set the environment variables: `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. 
diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 8bfa706cc8..d0d5baea3b 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -14,11 +14,10 @@ # TODO: remove before PR merge. Trigger CI error if the remote module wasn't built import kvikio._lib.remote_handle # isort: skip -if not kvikio.is_remote_file_available(): - pytest.skip( - "cannot test remote IO, please build KvikIO with with AWS S3 support", - allow_module_level=True, - ) +pytestmark = pytest.mark.skipif( + not kvikio.is_remote_file_available(), + reason="cannot test remote IO, please build KvikIO with with AWS S3 support", +) # Notice, we import boto and moto after the `is_remote_file_available` check. import boto3 From 8b4436ddc7ac05e16d897322fb667c6bed8c16c2 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 20 Aug 2024 15:10:07 +0200 Subject: [PATCH 26/88] client(): not shared pointer --- cpp/include/kvikio/remote_handle.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index bb8d128dac..14b1bcbc41 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -58,7 +58,7 @@ class S3Context { * * @return S3 client. */ - Aws::S3::S3Client& client() { return *_client; } + Aws::S3::S3Client& client() { return _client; } /** * @brief Get the default context, which is created on first call. 
@@ -99,7 +99,7 @@ class S3Context { * * @return The new client */ - static std::shared_ptr create_client() + static Aws::S3::S3Client create_client() { S3Context::ensure_aws_s3_api_init(); @@ -113,10 +113,10 @@ class S3Context { if (provider.GetAWSCredentials().IsEmpty()) { throw std::runtime_error(std::string("Failed authentication to ") + ep); } - return std::make_shared(Aws::S3::S3Client(clientConfig)); + return Aws::S3::S3Client(Aws::S3::S3Client(clientConfig)); } - std::shared_ptr _client; + Aws::S3::S3Client _client; }; /** From e830f62dc9d4a22982a5db999be83e327d644861 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 20 Aug 2024 15:14:18 +0200 Subject: [PATCH 27/88] style --- cpp/include/kvikio/remote_handle.hpp | 2 +- python/kvikio/tests/test_aws_s3.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 20f06f21c0..4ab8edc320 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -280,7 +280,7 @@ class RemoteHandle { while (byte_remaining > 0) { const std::size_t nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); - std::size_t nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); + std::size_t nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); cur_file_offset += nbytes_got; diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index d0d5baea3b..64e139da90 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -15,14 +15,14 @@ import kvikio._lib.remote_handle # isort: skip pytestmark = pytest.mark.skipif( - not kvikio.is_remote_file_available(), + not kvikio.is_remote_file_available(), reason="cannot test remote 
IO, please build KvikIO with with AWS S3 support", ) # Notice, we import boto and moto after the `is_remote_file_available` check. -import boto3 -import moto -import moto.server +import boto3 # noqa: E402 +import moto # noqa: E402 +import moto.server # noqa: E402 @pytest.fixture(scope="session") From bff52a43e76a91909b6fb60d147c78fcbc009a32 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 20 Aug 2024 16:50:58 +0200 Subject: [PATCH 28/88] S3Context::get_file_size --- cpp/include/kvikio/remote_handle.hpp | 48 ++++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 4ab8edc320..5f3a351fec 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -75,6 +75,28 @@ class S3Context { S3Context(S3Context const&) = delete; void operator=(S3Context const&) = delete; + /** + * @brief Get the size of a S3 file + * + * @param bucket_name The bucket name. + * @param object_name The object name. + * @return Size of the file in bytes. + */ + std::size_t get_file_size(const std::string& bucket_name, const std::string& object_name) + { + KVIKIO_NVTX_FUNC_RANGE(); + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket_name.c_str()); + req.SetKey(object_name.c_str()); + Aws::S3::Model::HeadObjectOutcome outcome = client().HeadObject(req); + if (!outcome.IsSuccess()) { + const Aws::S3::S3Error& err = outcome.GetError(); + throw std::invalid_argument("get_file_size(): " + err.GetExceptionName() + ": " + + err.GetMessage()); + } + return outcome.GetResult().GetContentLength(); + } + private: /** * @brief Initialize the S3 API (idempotent) @@ -119,28 +141,6 @@ class S3Context { Aws::S3::S3Client _client; }; -/** - * @brief Get the size of a S3 file - * - * @param bucket_name The bucket name. - * @param object_name The object name. - * @return Size of the file in bytes. 
- */ -inline std::size_t get_s3_file_size(const std::string& bucket_name, const std::string& object_name) -{ - KVIKIO_NVTX_FUNC_RANGE(); - Aws::S3::Model::HeadObjectRequest req; - req.SetBucket(bucket_name.c_str()); - req.SetKey(object_name.c_str()); - Aws::S3::Model::HeadObjectOutcome outcome = S3Context::default_context().client().HeadObject(req); - if (!outcome.IsSuccess()) { - const Aws::S3::S3Error& err = outcome.GetError(); - throw std::invalid_argument("get_s3_file_size(): " + err.GetExceptionName() + ": " + - err.GetMessage()); - } - return outcome.GetResult().GetContentLength(); -} - /** * @brief Given a file path like "s3:///", return the name of the bucket and object. * @@ -191,7 +191,7 @@ class RemoteHandle { RemoteHandle(std::string bucket_name, std::string object_name) : _bucket_name(std::move(bucket_name)), _object_name(std::move(object_name)), - _nbytes(detail::get_s3_file_size(_bucket_name, _object_name)) + _nbytes(detail::S3Context::default_context().get_file_size(_bucket_name, _object_name)) { } @@ -205,7 +205,7 @@ class RemoteHandle { auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); _bucket_name = std::move(bucket_name); _object_name = std::move(object_name); - _nbytes = detail::get_s3_file_size(_bucket_name, _object_name); + _nbytes = detail::S3Context::default_context().get_file_size(_bucket_name, _object_name); } /** From f6e7ebc14e7f4fd8a3a664457f8504801f00965e Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 20 Aug 2024 16:53:45 +0200 Subject: [PATCH 29/88] move S3Context out of detail --- cpp/include/kvikio/remote_handle.hpp | 48 ++++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 5f3a351fec..919e861374 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -46,6 +46,27 @@ class BufferAsStream : public Aws::IOStream { ~BufferAsStream() override = default; }; +/** + * @brief Given a file path like "s3:///", return the name of the bucket and object. + * + * @param path S3 file path. + * @return Pair of strings: [bucket-name, object-name]. + */ +inline std::pair parse_s3_path(const std::string& path) +{ + if (path.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } + if (path.size() < 5 || path.substr(0, 5) != "s3://") { + throw std::invalid_argument("The remote path must start with the S3 scheme (\"s3://\")."); + } + std::string p = path.substr(5); + if (p.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } + size_t pos = p.find_first_of('/'); + if (pos == 0) { throw std::invalid_argument("The remote path does not contain a bucket name."); } + return std::make_pair(p.substr(0, pos), (pos == std::string::npos) ? "" : p.substr(pos + 1)); +} + +} // namespace detail + /** * @brief S3 context, which initializes and maintains the S3 API and client. */ @@ -141,27 +162,6 @@ class S3Context { Aws::S3::S3Client _client; }; -/** - * @brief Given a file path like "s3:///", return the name of the bucket and object. - * - * @param path S3 file path. - * @return Pair of strings: [bucket-name, object-name]. 
- */ -inline std::pair parse_s3_path(const std::string& path) -{ - if (path.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } - if (path.size() < 5 || path.substr(0, 5) != "s3://") { - throw std::invalid_argument("The remote path must start with the S3 scheme (\"s3://\")."); - } - std::string p = path.substr(5); - if (p.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } - size_t pos = p.find_first_of('/'); - if (pos == 0) { throw std::invalid_argument("The remote path does not contain a bucket name."); } - return std::make_pair(p.substr(0, pos), (pos == std::string::npos) ? "" : p.substr(pos + 1)); -} - -} // namespace detail - /** * @brief Handle of remote file (currently, only AWS S3 is supported). * @@ -191,7 +191,7 @@ class RemoteHandle { RemoteHandle(std::string bucket_name, std::string object_name) : _bucket_name(std::move(bucket_name)), _object_name(std::move(object_name)), - _nbytes(detail::S3Context::default_context().get_file_size(_bucket_name, _object_name)) + _nbytes(S3Context::default_context().get_file_size(_bucket_name, _object_name)) { } @@ -205,7 +205,7 @@ class RemoteHandle { auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); _bucket_name = std::move(bucket_name); _object_name = std::move(object_name); - _nbytes = detail::S3Context::default_context().get_file_size(_bucket_name, _object_name); + _nbytes = S3Context::default_context().get_file_size(_bucket_name, _object_name); } /** @@ -229,7 +229,7 @@ class RemoteHandle { { KVIKIO_NVTX_FUNC_RANGE("AWS S3 receive", size); - auto& default_context = detail::S3Context::default_context(); + auto& default_context = S3Context::default_context(); Aws::S3::Model::GetObjectRequest req; req.SetBucket(_bucket_name.c_str()); req.SetKey(_object_name.c_str()); From 3f8291ae8a1ce11133a34ef8a4c1e59978d24cf6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 21 Aug 2024 08:24:45 +0200 Subject: [PATCH 30/88] cpp_RemoteHandle --- python/kvikio/kvikio/_lib/remote_handle.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 1fa436b01c..a9c0c39c2f 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -14,14 +14,14 @@ from kvikio._lib.arr cimport parse_buffer_argument from kvikio._lib.future cimport IOFuture, _wrap_io_future, future -cdef extern from "" namespace "kvikio" nogil: - cdef cppclass RemoteHandle: - RemoteHandle() except + - RemoteHandle( +cdef extern from "" nogil: + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": + cpp_RemoteHandle() except + + cpp_RemoteHandle( string bucket_name, string object_name, ) except + - RemoteHandle( + cpp_RemoteHandle( string remote_path, ) except + int nbytes() @@ -39,12 +39,12 @@ cdef extern from "" namespace "kvikio" nogil: cdef class RemoteFile: """ Remote file handle""" - cdef RemoteHandle _handle + cdef cpp_RemoteHandle _handle @classmethod def from_bucket_and_object(cls, bucket_name: str, object_name: str): cdef RemoteFile ret = RemoteFile() - ret._handle = RemoteHandle( + ret._handle = cpp_RemoteHandle( str.encode(str(bucket_name)), str.encode(str(object_name)), ) @@ -53,7 +53,7 @@ cdef class RemoteFile: @classmethod def from_url(cls, url: str): cdef RemoteFile ret = RemoteFile() - ret._handle = RemoteHandle(str.encode(str(url))) + ret._handle = cpp_RemoteHandle(str.encode(str(url))) return ret def nbytes(self) -> int: From 2607ca1ba9131da93d7e1f4df6542024a1b335a5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 21 Aug 2024 08:43:56 +0200 Subject: [PATCH 31/88] _get_remote_module --- python/kvikio/kvikio/remote_file.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index cbda20d8e5..c9189a23db 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -9,6 +9,7 @@ def is_remote_file_available() -> bool: + """Check if the remote module is available""" try: import kvikio._lib.remote_handle # noqa: F401 except ImportError: @@ -17,14 +18,15 @@ def is_remote_file_available() -> bool: return True -def _get_remote_remote_file_class(): +def _get_remote_module(): + """Get the remote module or raise an error""" if not is_remote_file_available(): raise RuntimeError( "RemoteFile not available, please build KvikIO with AWS S3 support" ) import kvikio._lib.remote_handle - return kvikio._lib.remote_handle.RemoteFile + return kvikio._lib.remote_handle class RemoteFile: @@ -48,7 +50,7 @@ def __init__(self, bucket_name: str, object_name: str): object_name Name of the object. """ - self._handle = _get_remote_remote_file_class().from_bucket_and_object( + self._handle = _get_remote_module().RemoteFile.from_bucket_and_object( bucket_name, object_name ) @@ -66,7 +68,7 @@ def from_url(cls, url: str) -> RemoteFile: A newly opened remote file """ ret = object.__new__(cls) - ret._handle = _get_remote_remote_file_class().from_url(url) + ret._handle = _get_remote_module().RemoteFile.from_url(url) return ret def __enter__(self) -> RemoteFile: From 6f3c25d2d9644f826cfb842ec7ec398efd16a285 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 21 Aug 2024 14:15:28 +0200 Subject: [PATCH 32/88] make S3Context public --- cpp/include/kvikio/remote_handle.hpp | 105 ++++++++----------- python/kvikio/kvikio/__init__.py | 3 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 24 ++++- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 5 +- python/kvikio/kvikio/remote_file.py | 13 ++- python/kvikio/tests/test_aws_s3.py | 16 +-- 6 files changed, 88 insertions(+), 78 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 919e861374..45e2098f5b 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -71,26 +71,44 @@ inline std::pair parse_s3_path(const std::string& path * @brief S3 context, which initializes and maintains the S3 API and client. */ class S3Context { - public: - S3Context() : _client{S3Context::create_client()} {} + private: + Aws::S3::S3Client _client; /** - * @brief Get a reference to the S3 client. + * @brief Create a new S3 client * - * @return S3 client. + * @return The new client */ - Aws::S3::S3Client& client() { return _client; } + static Aws::S3::S3Client create_client() + { + Aws::SDKOptions options; + Aws::InitAPI(options); // Should only be called once. + + // Read AWS_ENDPOINT_URL to overwrite endpoint + Aws::Client::ClientConfiguration clientConfig; + const char* ep = std::getenv("AWS_ENDPOINT_URL"); + if (ep != nullptr) { clientConfig.endpointOverride = ep; } + + // We check authentication here to trigger an early exception. 
+ Aws::Auth::DefaultAWSCredentialsProviderChain provider; + if (provider.GetAWSCredentials().IsEmpty()) { + throw std::runtime_error(std::string("Failed authentication to ") + ep); + } + return Aws::S3::S3Client(Aws::S3::S3Client(clientConfig)); + } + + public: + S3Context() : _client{S3Context::create_client()} + { + std::cout << "S3Context - name: " << _client.GetServiceName() << std::endl; + } /** - * @brief Get the default context, which is created on first call. + * @brief Get a reference to the S3 client. * - * @return The default S3 context. + * @return S3 client. */ - static S3Context& default_context() - { - static S3Context _default_context; - return _default_context; - } + Aws::S3::S3Client& client() { return _client; } // No copy semantic S3Context(S3Context const&) = delete; @@ -117,49 +135,6 @@ class S3Context { } return outcome.GetResult().GetContentLength(); } - - private: - /** - * @brief Initialize the S3 API (idempotent) - * - * This private function is called as part of `S3Context` creation and it makes sure to only call - * `Aws::InitAPI()` once. - */ - static void ensure_aws_s3_api_init() - { - static bool not_initalized{true}; - if (not_initalized) { - not_initalized = false; - - Aws::SDKOptions options; - // options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Error; - Aws::InitAPI(options); // Should only be called once. - } - } - - /** - * @brief Create a new S3 client - * - * @return The new client - */ - static Aws::S3::S3Client create_client() - { - S3Context::ensure_aws_s3_api_init(); - - // Read AWS_ENDPOINT_URL to overwrite endpoint - Aws::Client::ClientConfiguration clientConfig; - const char* ep = std::getenv("AWS_ENDPOINT_URL"); - if (ep != nullptr) { clientConfig.endpointOverride = ep; } - - // We check authentication here to trigger an early exception. 
- Aws::Auth::DefaultAWSCredentialsProviderChain provider; - if (provider.GetAWSCredentials().IsEmpty()) { - throw std::runtime_error(std::string("Failed authentication to ") + ep); - } - return Aws::S3::S3Client(Aws::S3::S3Client(clientConfig)); - } - - Aws::S3::S3Client _client; }; /** @@ -177,6 +152,7 @@ class RemoteHandle { std::string _bucket_name{}; std::string _object_name{}; std::size_t _nbytes{}; + std::shared_ptr _context; public: // Use of a default constructed instance is undefined behavior. @@ -188,11 +164,13 @@ class RemoteHandle { * @param bucket_name Name of the bucket. * @param object_name Name of the object. */ - RemoteHandle(std::string bucket_name, std::string object_name) - : _bucket_name(std::move(bucket_name)), - _object_name(std::move(object_name)), - _nbytes(S3Context::default_context().get_file_size(_bucket_name, _object_name)) + RemoteHandle(std::shared_ptr context, std::string bucket_name, std::string object_name) { + if (!context) { throw std::invalid_argument("context cannot be null"); } + _context = std::move(context); + _bucket_name = std::move(bucket_name); + _object_name = std::move(object_name); + _nbytes = _context->get_file_size(_bucket_name, _object_name); } /** @@ -200,12 +178,14 @@ class RemoteHandle { * * @param remote_path Remote file path. 
*/ - RemoteHandle(const std::string& remote_path) + RemoteHandle(std::shared_ptr context, const std::string& remote_path) { + if (!context) { throw std::invalid_argument("context cannot be null"); } + _context = std::move(context); auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); _bucket_name = std::move(bucket_name); _object_name = std::move(object_name); - _nbytes = S3Context::default_context().get_file_size(_bucket_name, _object_name); + _nbytes = _context->get_file_size(_bucket_name, _object_name); } /** @@ -229,7 +209,6 @@ class RemoteHandle { { KVIKIO_NVTX_FUNC_RANGE("AWS S3 receive", size); - auto& default_context = S3Context::default_context(); Aws::S3::Model::GetObjectRequest req; req.SetBucket(_bucket_name.c_str()); req.SetKey(_object_name.c_str()); @@ -242,7 +221,7 @@ class RemoteHandle { req.SetResponseStreamFactory( [&]() { return Aws::New("BufferAsStream", &buf_stream); }); - Aws::S3::Model::GetObjectOutcome outcome = default_context.client().GetObject(req); + Aws::S3::Model::GetObjectOutcome outcome = _context->client().GetObject(req); if (!outcome.IsSuccess()) { const Aws::S3::S3Error& err = outcome.GetError(); throw std::runtime_error(err.GetExceptionName() + ": " + err.GetMessage()); diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index e3bb0ffd37..4bd17e5c8c 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -4,7 +4,7 @@ from kvikio._lib import buffer, driver_properties # type: ignore from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile -from kvikio.remote_file import RemoteFile, is_remote_file_available +from kvikio.remote_file import RemoteFile, S3Context, is_remote_file_available def memory_register(buf) -> None: @@ -24,5 +24,6 @@ def memory_deregister(buf) -> None: "__version__", "CuFile", "RemoteFile", + "S3Context", "is_remote_file_available", ] diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx 
b/python/kvikio/kvikio/_lib/remote_handle.pyx index a9c0c39c2f..f654019c81 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -7,6 +7,7 @@ from typing import Optional from libc.stdint cimport uintptr_t +from libcpp.memory cimport make_shared, shared_ptr from libcpp.string cimport string from libcpp.utility cimport pair @@ -15,13 +16,19 @@ from kvikio._lib.future cimport IOFuture, _wrap_io_future, future cdef extern from "" nogil: + + cdef cppclass cpp_S3Context "kvikio::S3Context": + cpp_S3Context() except + + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": cpp_RemoteHandle() except + cpp_RemoteHandle( + shared_ptr[cpp_S3Context] context, string bucket_name, string object_name, ) except + cpp_RemoteHandle( + shared_ptr[cpp_S3Context] context, string remote_path, ) except + int nbytes() @@ -36,24 +43,35 @@ cdef extern from "" nogil: size_t file_offset ) except + +cdef class S3Context: + cdef shared_ptr[cpp_S3Context] _handle + + def __init__(self): + self._handle = make_shared[cpp_S3Context]() cdef class RemoteFile: """ Remote file handle""" cdef cpp_RemoteHandle _handle @classmethod - def from_bucket_and_object(cls, bucket_name: str, object_name: str): + def from_bucket_and_object( + cls, + S3Context context, + bucket_name: str, + object_name: str + ): cdef RemoteFile ret = RemoteFile() ret._handle = cpp_RemoteHandle( + context._handle, str.encode(str(bucket_name)), str.encode(str(object_name)), ) return ret @classmethod - def from_url(cls, url: str): + def from_url(cls, S3Context context, url: str): cdef RemoteFile ret = RemoteFile() - ret._handle = cpp_RemoteHandle(str.encode(str(url))) + ret._handle = cpp_RemoteHandle(context._handle, str.encode(str(url))) return ret def nbytes(self) -> int: diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index 989e482d58..9d86aa5f8e 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ 
b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -82,10 +82,13 @@ def run_numpy_like(args, xp): client = create_client_and_bucket() client.put_object(Bucket=args.bucket, Key="data1", Body=bytes(data)) + context = kvikio.S3Context() def run() -> float: t0 = time.perf_counter() - with kvikio.RemoteFile(bucket_name=args.bucket, object_name="data1") as f: + with kvikio.RemoteFile( + context=context, bucket_name=args.bucket, object_name="data1" + ) as f: res = f.read(recv) t1 = time.perf_counter() assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index c9189a23db..35cca41a0c 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -29,6 +29,11 @@ def _get_remote_module(): return kvikio._lib.remote_handle +class S3Context: + def __init__(self): + self._handle = _get_remote_module().S3Context() + + class RemoteFile: """File handle of a remote file (currently, only AWS S3 is supported). @@ -40,7 +45,7 @@ class RemoteFile: . """ - def __init__(self, bucket_name: str, object_name: str): + def __init__(self, context: S3Context, bucket_name: str, object_name: str): """Open a remote file given a bucket and object name. Parameters @@ -51,11 +56,11 @@ def __init__(self, bucket_name: str, object_name: str): Name of the object. """ self._handle = _get_remote_module().RemoteFile.from_bucket_and_object( - bucket_name, object_name + context._handle, bucket_name, object_name ) @classmethod - def from_url(cls, url: str) -> RemoteFile: + def from_url(cls, context: S3Context, url: str) -> RemoteFile: """Open a remote file given an url such as "s3:///". 
Parameters @@ -68,7 +73,7 @@ def from_url(cls, url: str) -> RemoteFile: A newly opened remote file """ ret = object.__new__(cls) - ret._handle = _get_remote_module().RemoteFile.from_url(url) + ret._handle = _get_remote_module().RemoteFile.from_url(context._handle, url) return ret def __enter__(self) -> RemoteFile: diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 64e139da90..4962ea1466 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -91,7 +91,7 @@ def s3_context(s3_base, bucket, files=None): client.create_bucket(Bucket=bucket, ACL="public-read-write") for f, data in files.items(): client.put_object(Bucket=bucket, Key=f, Body=data) - yield + yield kvikio.S3Context() for f, data in files.items(): try: client.delete_object(Bucket=bucket, Key=f) @@ -103,8 +103,10 @@ def test_read(s3_base, xp): bucket_name = "test_read" object_name = "a1" a = xp.arange(10_000_000) - with s3_context(s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}): - with kvikio.RemoteFile(bucket_name, object_name) as f: + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} + ) as ctx: + with kvikio.RemoteFile(ctx, bucket_name, object_name) as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) assert f.read(buf=b) == a.nbytes @@ -124,13 +126,15 @@ def test_read_with_file_offset(s3_base, xp, start, end): bucket_name = "test_read" object_name = "a1" a = xp.arange(end, dtype=xp.int64) - with s3_context(s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)}): - with kvikio.RemoteFile(bucket_name, object_name) as f: + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} + ) as ctx: + with kvikio.RemoteFile(ctx, bucket_name, object_name) as f: b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) - with 
kvikio.RemoteFile.from_url(f"s3://{bucket_name}/{object_name}") as f: + with kvikio.RemoteFile.from_url(ctx, f"s3://{bucket_name}/{object_name}") as f: b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) From 149d6be70d726c509de1d15b303ff48b1745e9ea Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 21 Aug 2024 14:44:30 +0200 Subject: [PATCH 33/88] call Aws::ShutdownAPI() --- cpp/include/kvikio/remote_handle.hpp | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 45e2098f5b..fe9a084054 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -68,7 +68,16 @@ inline std::pair parse_s3_path(const std::string& path } // namespace detail /** - * @brief S3 context, which initializes and maintains the S3 API and client. + * @brief S3 context, which initializes and maintains the S3 SDK and client. + * + * Because S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, this class inherit some + * limitations from the SDK. + * - The SDK for C++ and its dependencies use C++ static objects, and the order of static object + * destruction is not determined by the C++ standard. To avoid memory issues caused by the + * nondeterministic order of static variable destruction, do not wrap `S3Context` in another + * static object. + * - Please construct and destruct `S3Context` from the same thread (use a dedicated thread if + * necessary). This avoids problems in initializing the dependent Common RunTime C libraries. */ class S3Context { private: @@ -81,8 +90,10 @@ class S3Context { */ static Aws::S3::S3Client create_client() { + // Notice, the S3 SDK allows multiple calls to `Aws::InitAPI`, see: + // Aws::SDKOptions options; - Aws::InitAPI(options); // Should only be called once. 
+ Aws::InitAPI(options); // Read AWS_ENDPOINT_URL to overwrite endpoint Aws::Client::ClientConfiguration clientConfig; @@ -98,9 +109,16 @@ class S3Context { } public: - S3Context() : _client{S3Context::create_client()} + S3Context() : _client{S3Context::create_client()} {} + + ~S3Context() noexcept { - std::cout << "S3Context - name: " << _client.GetServiceName() << std::endl; + try { + Aws::SDKOptions options; + Aws::ShutdownAPI(options); + } catch (const std::exception& e) { + std::cerr << "~S3Context(): " << e.what() << std::endl; + } } /** From 98991b315e8d3200cf4a7c2ae3ae0e2347604837 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 21 Aug 2024 14:59:56 +0200 Subject: [PATCH 34/88] clean up --- cpp/include/kvikio/remote_handle.hpp | 33 ++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index fe9a084054..65f9938acb 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -176,34 +177,44 @@ class RemoteHandle { // Use of a default constructed instance is undefined behavior. RemoteHandle() noexcept = default; + /** + * @brief Construct from a bucket and object name pair. + * + * @param context The S3 context used for the connection to the remove server. + * @param bucket_and_object_name Name pair . + */ + RemoteHandle(std::shared_ptr context, + std::pair bucket_and_object_name) + { + if (!context) { throw std::invalid_argument("RemoteHandle(): context cannot be null"); } + _context = std::move(context); + _bucket_name = std::move(bucket_and_object_name.first); + _object_name = std::move(bucket_and_object_name.second); + _nbytes = _context->get_file_size(_bucket_name, _object_name); + } + /** * @brief Construct from a bucket and object name. * + * @param context The S3 context used for the connection to the remove server. 
* @param bucket_name Name of the bucket. * @param object_name Name of the object. */ RemoteHandle(std::shared_ptr context, std::string bucket_name, std::string object_name) + : RemoteHandle(std::move(context), + std::make_pair(std::move(bucket_name), std::move(object_name))) { - if (!context) { throw std::invalid_argument("context cannot be null"); } - _context = std::move(context); - _bucket_name = std::move(bucket_name); - _object_name = std::move(object_name); - _nbytes = _context->get_file_size(_bucket_name, _object_name); } /** * @brief Construct from a remote path such as "s3:///". * + * @param context The S3 context used for the connection to the remove server. * @param remote_path Remote file path. */ RemoteHandle(std::shared_ptr context, const std::string& remote_path) + : RemoteHandle(std::move(context), detail::parse_s3_path(remote_path)) { - if (!context) { throw std::invalid_argument("context cannot be null"); } - _context = std::move(context); - auto [bucket_name, object_name] = detail::parse_s3_path(remote_path); - _bucket_name = std::move(bucket_name); - _object_name = std::move(object_name); - _nbytes = _context->get_file_size(_bucket_name, _object_name); } /** From daac52f752d37d4f0595141b551d8622ddaff634 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 21 Aug 2024 15:03:48 +0200 Subject: [PATCH 35/88] doc --- python/kvikio/kvikio/_lib/remote_handle.pyx | 1 - python/kvikio/kvikio/remote_file.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index f654019c81..5069c5e83d 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -50,7 +50,6 @@ cdef class S3Context: self._handle = make_shared[cpp_S3Context]() cdef class RemoteFile: - """ Remote file handle""" cdef cpp_RemoteHandle _handle @classmethod diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 35cca41a0c..ff9229f297 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -30,6 +30,8 @@ def _get_remote_module(): class S3Context: + """S3 context, which initializes and maintains the S3 SDK and client.""" + def __init__(self): self._handle = _get_remote_module().S3Context() @@ -50,6 +52,8 @@ def __init__(self, context: S3Context, bucket_name: str, object_name: str): Parameters ---------- + context + The S3 context used for the connection to the remove server. bucket_name Name of the bucket. object_name @@ -65,6 +69,8 @@ def from_url(cls, context: S3Context, url: str) -> RemoteFile: Parameters ---------- + context + The S3 context used for the connection to the remove server. url URL to the remote file. From ba6925ce57e854a9d5435cdf062d1643a8108295 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 21 Aug 2024 18:53:41 +0200 Subject: [PATCH 36/88] doc --- cpp/include/kvikio/remote_handle.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 65f9938acb..285c1c5151 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -245,7 +245,9 @@ class RemoteHandle { "bytes=" + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); req.SetRange(byte_range.c_str()); - // To write directly to `buf`, we register a "factory" that wraps a buffer as a output stream. + // To write directly to `buf`, we register a "factory" that wraps a buffer as an output stream. + // Notice, the AWS SDK will handle the freeing of the returned `detail::BufferAsStream`: + // Aws::Utils::Stream::PreallocatedStreamBuf buf_stream(static_cast(buf), size); req.SetResponseStreamFactory( [&]() { return Aws::New("BufferAsStream", &buf_stream); }); From c6355193d1fbf42389eb4ba307720eb8a3eaa452 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 22 Aug 2024 13:13:21 +0200 Subject: [PATCH 37/88] S3Context: endpoint_override --- cpp/include/kvikio/remote_handle.hpp | 90 ++++++++++++++------- python/kvikio/kvikio/_lib/remote_handle.pyx | 9 ++- python/kvikio/kvikio/remote_file.py | 33 +++++--- python/kvikio/tests/test_aws_s3.py | 5 +- 4 files changed, 90 insertions(+), 47 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 285c1c5151..b340418ee6 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -71,8 +71,8 @@ inline std::pair parse_s3_path(const std::string& path /** * @brief S3 context, which initializes and maintains the S3 SDK and client. * - * Because S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, this class inherit some - * limitations from the SDK. 
+ * If not given an existing S3 client, S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, + * which inherit some limitations from the SDK. * - The SDK for C++ and its dependencies use C++ static objects, and the order of static object * destruction is not determined by the C++ standard. To avoid memory issues caused by the * nondeterministic order of static variable destruction, do not wrap `S3Context` in another @@ -82,43 +82,76 @@ inline std::pair parse_s3_path(const std::string& path */ class S3Context { private: - Aws::S3::S3Client _client; + // We use a shared point since constructing a default `Aws::S3::S3Client` before calling + // `Aws::InitAPI` is illegal. + std::shared_ptr _client; + // Only call `Aws::ShutdownAPI`, if `Aws::InitAPI` was called on construction. + const bool _shutdown_s3_api; + + public: + /** + * @brief Create a context given an existing S3 client + * + * The S3 SDK isn't initialized. + * + * @param client The S3 client + */ + S3Context(std::shared_ptr client) + : _client{std::move(client)}, _shutdown_s3_api{false} + { + if (!_client) { throw std::invalid_argument("S3Context(): S3 client cannot be null"); } + } /** - * @brief Create a new S3 client + * @brief Create a new context with a newly created S3 client. * - * @return The new client + * The S3 SDK is automatically initialized on construction and shutdown on destruction. + * + * The new S3 client use the default `Aws::Client::ClientConfiguration`, thus please make sure + * that AWS credentials have been configure on the system. A common way to do this, is to set the + * environment variables: `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + * + * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see + * . + * + * @param endpoint_override If not empty, the address of the S3 server. Takes precedences + * over the `AWS_ENDPOINT_URL` environment variable. 
*/ - static Aws::S3::S3Client create_client() + S3Context(const std::string& endpoint_override = "") : _shutdown_s3_api{true} { - // Notice, the S3 SDK allows multiple calls to `Aws::InitAPI`, see: + // NB: `Aws::InitAPI` has to be called before everything in the SDK beside `Aws::SDKOptions`, + // even before config structs like `Aws::Client::ClientConfiguration`. + // Notice, we may call `Aws::InitAPI`, which is allowed see: // Aws::SDKOptions options; Aws::InitAPI(options); - // Read AWS_ENDPOINT_URL to overwrite endpoint - Aws::Client::ClientConfiguration clientConfig; + // Create a client config where `endpoint_override` takes precedences over `AWS_ENDPOINT_URL` + Aws::Client::ClientConfiguration config; const char* ep = std::getenv("AWS_ENDPOINT_URL"); - if (ep != nullptr) { clientConfig.endpointOverride = ep; } + if (!endpoint_override.empty()) { + config.endpointOverride = endpoint_override; + } else if (ep != nullptr && !std::string(ep).empty()) { + config.endpointOverride = ep; + } // We check authentication here to trigger an early exception. Aws::Auth::DefaultAWSCredentialsProviderChain provider; if (provider.GetAWSCredentials().IsEmpty()) { - throw std::runtime_error(std::string("Failed authentication to ") + ep); + throw std::runtime_error("failed authentication to S3 server"); } - return Aws::S3::S3Client(Aws::S3::S3Client(clientConfig)); + _client = std::make_shared(config); } - public: - S3Context() : _client{S3Context::create_client()} {} - ~S3Context() noexcept { - try { - Aws::SDKOptions options; - Aws::ShutdownAPI(options); - } catch (const std::exception& e) { - std::cerr << "~S3Context(): " << e.what() << std::endl; + if (_shutdown_s3_api) { + try { + Aws::SDKOptions options; + Aws::ShutdownAPI(options); + } catch (const std::exception& e) { + std::cerr << "~S3Context(): " << e.what() << std::endl; + } } } @@ -127,11 +160,13 @@ class S3Context { * * @return S3 client. 
*/ - Aws::S3::S3Client& client() { return _client; } + Aws::S3::S3Client& client() { return *_client; } - // No copy semantic - S3Context(S3Context const&) = delete; - void operator=(S3Context const&) = delete; + // No copy and move semantic + S3Context(S3Context const&) = delete; + void operator=(S3Context const&) = delete; + S3Context(S3Context const&&) = delete; + void operator=(S3Context const&&) = delete; /** * @brief Get the size of a S3 file @@ -158,13 +193,6 @@ class S3Context { /** * @brief Handle of remote file (currently, only AWS S3 is supported). - * - * Please make sure that AWS credentials have been configure on the system. - * A common way to do this, is to set the environment variables: - * `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. - * - * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see - * . */ class RemoteHandle { private: diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 5069c5e83d..d704d6f8c2 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -19,6 +19,7 @@ cdef extern from "" nogil: cdef cppclass cpp_S3Context "kvikio::S3Context": cpp_S3Context() except + + cpp_S3Context(string endpoint_override) except + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": cpp_RemoteHandle() except + @@ -46,8 +47,12 @@ cdef extern from "" nogil: cdef class S3Context: cdef shared_ptr[cpp_S3Context] _handle - def __init__(self): - self._handle = make_shared[cpp_S3Context]() + def __init__(self, endpoint_override: Optional[str]): + if endpoint_override is None: + self._handle = make_shared[cpp_S3Context]() + return + cdef string s = str.encode(str(endpoint_override)) + self._handle = make_shared[cpp_S3Context](s) cdef class RemoteFile: cdef cpp_RemoteHandle _handle diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index ff9229f297..bacc6b0b73 100644 --- 
a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -30,22 +30,33 @@ def _get_remote_module(): class S3Context: - """S3 context, which initializes and maintains the S3 SDK and client.""" + def __init__(self, endpoint_override: Optional[str] = None): + """S3 context, which initializes and maintains the S3 SDK and client. - def __init__(self): - self._handle = _get_remote_module().S3Context() + The S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, which inherit + some limitations from the SDK: + - Please construct and destruct `S3Context` from the same thread (use a + dedicated thread if necessary). This avoids problems in initializing + the dependent Common RunTime C libraries. + Please make sure that AWS credentials have been configured on the system. + A common way to do this, is to set the environment variables: + `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. -class RemoteFile: - """File handle of a remote file (currently, only AWS S3 is supported). + Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see + . + + Parameters + ---------- + endpoint_override + If not empty, the address of the S3 server. Takes precedences over the + `AWS_ENDPOINT_URL` environment variable. + """ + self._handle = _get_remote_module().S3Context(endpoint_override) - Please make sure that AWS credentials have been configured on the system. - A common way to do this, is to set the environment variables: - `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. - Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see - . - """ +class RemoteFile: + """File handle of a remote file (currently, only AWS S3 is supported).""" def __init__(self, context: S3Context, bucket_name: str, object_name: str): """Open a remote file given a bucket and object name. 
diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 4962ea1466..a4569b7601 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -74,11 +74,10 @@ def s3_base(endpoint_ip, endpoint_port): os.environ["AWS_SECURITY_TOKEN"] = "foobar_security_token" os.environ["AWS_SESSION_TOKEN"] = "foobar_session_token" os.environ["AWS_DEFAULT_REGION"] = "us-east-1" - os.environ["AWS_ENDPOINT_URL"] = f"http://{endpoint_ip}:{endpoint_port}" p = mp.Process(target=start_s3_server, args=(endpoint_ip, endpoint_port)) p.start() - yield os.environ["AWS_ENDPOINT_URL"] + yield f"http://{endpoint_ip}:{endpoint_port}" p.kill() @@ -91,7 +90,7 @@ def s3_context(s3_base, bucket, files=None): client.create_bucket(Bucket=bucket, ACL="public-read-write") for f, data in files.items(): client.put_object(Bucket=bucket, Key=f, Body=data) - yield kvikio.S3Context() + yield kvikio.S3Context(s3_base) for f, data in files.items(): try: client.delete_object(Bucket=bucket, Key=f) From b8bfef2a238ef3fdbbae47f239a0cedc4b430007 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 22 Aug 2024 13:48:33 +0200 Subject: [PATCH 38/88] examples/aws_s3.py --- python/kvikio/examples/aws_s3.py | 43 ++++++++++++++++++++++++++++ python/kvikio/tests/test_examples.py | 19 +++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 python/kvikio/examples/aws_s3.py diff --git a/python/kvikio/examples/aws_s3.py b/python/kvikio/examples/aws_s3.py new file mode 100644 index 0000000000..03f1d416b1 --- /dev/null +++ b/python/kvikio/examples/aws_s3.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +import os + +import boto3 +import cupy + +import kvikio +from kvikio.benchmarks.aws_s3_io import get_local_port, local_s3_server + + +def main(): + a = cupy.arange(100) + b = cupy.empty_like(a) + + # In this example, we launch and use a local S3 server with the + # following available address: + endpoint_url = f"http://127.0.0.1:{get_local_port()}" + + # In order use a local server instead of an official Amazon S3 server, + # we set the AWS_ENDPOINT_URL environment variable. + os.environ["AWS_ENDPOINT_URL"] = endpoint_url + + # Start a local S3 server + with local_s3_server(lifetime=100): + # Create the bucket "my-bucket" and the object "data" + client = boto3.client("s3", endpoint_url=endpoint_url) + client.create_bucket(Bucket="my-bucket", ACL="public-read-write") + client.put_object(Bucket="my-bucket", Key="data", Body=bytes(a)) + + # Create a S3 context that connects to AWS_ENDPOINT_URL + context = kvikio.S3Context() + + # Using the context, we can open "data" as if it was a regular CuFile + with kvikio.RemoteFile(context, "my-bucket", "data") as f: + f.read(b) + print(a) + print(b) + + +if __name__ == "__main__": + main() diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index e9e1f83d08..8956ba7bd5 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. import os @@ -7,6 +7,8 @@ import pytest +import kvikio + examples_path = Path(os.path.realpath(__file__)).parent / ".." 
/ "examples" @@ -26,3 +28,18 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): monkeypatch.syspath_prepend(str(examples_path)) import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") + + +def test_aws_s3(monkeypatch): + """Test examples/aws_s3.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "cannot test remote IO, please build KvikIO with with AWS S3 support" + ) + # Fail early if dependencies aren't available + import boto3 # noqa: F401 + import moto # noqa: F401 + + monkeypatch.syspath_prepend(str(examples_path)) + import_module("aws_s3").main() From 4c4b36e0dc8339e600aec1129b341354f8c6c9b0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 22 Aug 2024 14:24:31 +0200 Subject: [PATCH 39/88] doc --- README.md | 82 ++++++++++++++++++++++++++++- docs/source/api.rst | 10 ++++ docs/source/conf.py | 8 +++ docs/source/index.rst | 1 + docs/source/remote_file.rst | 7 +++ python/kvikio/kvikio/defaults.py | 16 +++--- python/kvikio/kvikio/remote_file.py | 8 +-- 7 files changed, 119 insertions(+), 13 deletions(-) create mode 100644 docs/source/remote_file.rst diff --git a/README.md b/README.md index 4df538aa2d..f4807f1b90 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,89 @@ The C++ library is header-only making it easy to include in [existing projects]( * A Python [Zarr](https://zarr.readthedocs.io/en/stable/) backend for reading and writing GPU data to file seamlessly. * Concurrent reads and writes using an internal thread pool. * Non-blocking API. -* Handle both host and device IO seamlessly. +* Read/write to both host and device memory seamlessly. +* Provides remote read from AWS S3 storage seamlessly. * Provides Python bindings to [nvCOMP](https://github.com/NVIDIA/nvcomp).
+ ### Documentation * Python: * C++: + + +### Examples + +#### Python +```python +import cupy +import kvikio + +def main(path): + a = cupy.arange(100) + f = kvikio.CuFile(path, "w") + # Write whole array to file + f.write(a) + f.close() + + b = cupy.empty_like(a) + f = kvikio.CuFile(path, "r") + # Read whole array from file + f.read(b) + assert all(a == b) + + # Use context manager + c = cupy.empty_like(a) + with kvikio.CuFile(path, "r") as f: + f.read(c) + assert all(a == c) + + # Non-blocking read + d = cupy.empty_like(a) + with kvikio.CuFile(path, "r") as f: + future1 = f.pread(d[:50]) + future2 = f.pread(d[50:], file_offset=d[:50].nbytes) + future1.get() # Wait for first read + future2.get() # Wait for second read + assert all(a == d) + + +if __name__ == "__main__": + main("/tmp/kvikio-hello-world-file") +``` + +#### C++ +```c++ +#include +#include +#include +using namespace std; + +int main() +{ + // Create two arrays `a` and `b` + constexpr std::size_t size = 100; + void *a = nullptr; + void *b = nullptr; + cudaMalloc(&a, size); + cudaMalloc(&b, size); + + // Write `a` to file + kvikio::FileHandle fw("test-file", "w"); + size_t written = fw.write(a, size); + fw.close(); + + // Read file into `b` + kvikio::FileHandle fr("test-file", "r"); + size_t read = fr.read(b, size); + fr.close(); + + // Read file into `b` in parallel using 16 threads + kvikio::default_thread_pool::reset(16); + { + kvikio::FileHandle f("test-file", "r"); + future future = f.pread(b, size, 0); // Non-blocking + size_t read = future.get(); // Blocking + // Notice, `f` closes automatically on destruction. + } +} +``` diff --git a/docs/source/api.rst b/docs/source/api.rst index 4d19c09bbb..a06f9f2d58 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -18,6 +18,16 @@ Zarr .. autoclass:: GDSStore :members: +RemoteFile +---------- +.. currentmodule:: kvikio.remote_file + +.. autoclass:: S3Context + :members: + +.. autoclass:: RemoteFile + :members: + Defaults -------- ..
currentmodule:: kvikio.defaults diff --git a/docs/source/conf.py b/docs/source/conf.py index 089a8033f6..603b6736e8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -82,6 +82,14 @@ pygments_style = None +autodoc_default_options = { + 'members': True, + 'member-order': 'bysource', + 'special-members': '__init__', + 'undoc-members': True, + 'exclude-members': '__weakref__' +} + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/source/index.rst b/docs/source/index.rst index 4dd491fd96..9e302b5f44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,6 +23,7 @@ Contents install quickstart zarr + remote_file runtime_settings api genindex diff --git a/docs/source/remote_file.rst b/docs/source/remote_file.rst new file mode 100644 index 0000000000..b1bc9c585b --- /dev/null +++ b/docs/source/remote_file.rst @@ -0,0 +1,7 @@ +Remote File +=========== + +KvikIO provides direct access to `AWS S3 storage `_. + +.. literalinclude:: ../../python/kvikio/examples/aws_s3.py + :language: python diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index ce66cc70f4..2f92eb9e76 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -23,8 +23,8 @@ def compat_mode() -> bool: - when `/run/udev` isn't readable, which typically happens when running inside a docker image not launched with `--volume /run/udev:/run/udev:ro` - Return - ------ + Returns + ------- bool Whether KvikIO is running in compatibility mode or not. """ @@ -68,8 +68,8 @@ def get_num_threads() -> int: Set the default value using `num_threads_reset()` or by setting the `KVIKIO_NTHREADS` environment variable. If not set, the default value is 1. - Return - ------ + Returns + ------- nthreads: int The number of threads in the current thread pool. 
""" @@ -119,8 +119,8 @@ def task_size() -> int: the `KVIKIO_TASK_SIZE` environment variable. If not set, the default value is 4 MiB. - Return - ------ + Returns + ------- nbytes: int The default task size in bytes. """ @@ -166,8 +166,8 @@ def gds_threshold() -> int: `KVIKIO_TASK_SIZE` environment variable. If not set, the default value is 1 MiB. - Return - ------ + Returns + ------- nbytes : int The default GDS threshold size in bytes. """ diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index bacc6b0b73..b950da2b32 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -34,10 +34,9 @@ def __init__(self, endpoint_override: Optional[str] = None): """S3 context, which initializes and maintains the S3 SDK and client. The S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, which inherit - some limitations from the SDK: - - Please construct and destruct `S3Context` from the same thread (use a - dedicated thread if necessary). This avoids problems in initializing - the dependent Common RunTime C libraries. + some limitations from the SDK: please construct and destruct `S3Context` + from the same thread (use a dedicated thread if necessary). This avoids + problems in initializing the dependent Common RunTime C libraries. Please make sure that AWS credentials have been configured on the system. A common way to do this, is to set the environment variables: @@ -51,6 +50,7 @@ def __init__(self, endpoint_override: Optional[str] = None): endpoint_override If not empty, the address of the S3 server. Takes precedences over the `AWS_ENDPOINT_URL` environment variable. + """ self._handle = _get_remote_module().S3Context(endpoint_override) From e489bbe2d0ba91f0ca5c3bb05c937183a938d789 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 22 Aug 2024 14:27:33 +0200 Subject: [PATCH 40/88] doc --- cpp/include/kvikio/remote_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b340418ee6..6b22e2723f 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -121,7 +121,7 @@ class S3Context { { // NB: `Aws::InitAPI` has to be called before everything in the SDK beside `Aws::SDKOptions`, // even before config structs like `Aws::Client::ClientConfiguration`. - // Notice, we may call `Aws::InitAPI`, which is allowed see: + // Notice, we may call `Aws::InitAPI` multiple times, which is allowed see: // Aws::SDKOptions options; Aws::InitAPI(options); From 73d91046123253b7970c72576396e941e059b3d9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 22 Aug 2024 14:34:22 +0200 Subject: [PATCH 41/88] RemoteHandle: no default ctor --- cpp/include/kvikio/remote_handle.hpp | 3 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 23 ++++++++++----------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 6b22e2723f..5d6d0d6935 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -202,9 +202,6 @@ class RemoteHandle { std::shared_ptr _context; public: - // Use of a default constructed instance is undefined behavior. - RemoteHandle() noexcept = default; - /** * @brief Construct from a bucket and object name pair. 
* diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index d704d6f8c2..8d9c0a219d 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -6,8 +6,9 @@ from typing import Optional +from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_shared, shared_ptr +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.utility cimport pair @@ -22,7 +23,6 @@ cdef extern from "" nogil: cpp_S3Context(string endpoint_override) except + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": - cpp_RemoteHandle() except + cpp_RemoteHandle( shared_ptr[cpp_S3Context] context, string bucket_name, @@ -55,7 +55,7 @@ cdef class S3Context: self._handle = make_shared[cpp_S3Context](s) cdef class RemoteFile: - cdef cpp_RemoteHandle _handle + cdef unique_ptr[cpp_RemoteHandle] _handle @classmethod def from_bucket_and_object( @@ -65,25 +65,24 @@ cdef class RemoteFile: object_name: str ): cdef RemoteFile ret = RemoteFile() - ret._handle = cpp_RemoteHandle( - context._handle, - str.encode(str(bucket_name)), - str.encode(str(object_name)), - ) + cdef string b = str.encode(str(bucket_name)) + cdef string o = str.encode(str(object_name)) + ret._handle = make_unique[cpp_RemoteHandle](context._handle, b, o) return ret @classmethod def from_url(cls, S3Context context, url: str): cdef RemoteFile ret = RemoteFile() - ret._handle = cpp_RemoteHandle(context._handle, str.encode(str(url))) + cdef string u = str.encode(str(url)) + ret._handle = make_unique[cpp_RemoteHandle](context._handle, u) return ret def nbytes(self) -> int: - return self._handle.nbytes() + return deref(self._handle).nbytes() def read(self, buf, size: Optional[int], file_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return self._handle.read( + return 
deref(self._handle).read( info.first, info.second, file_offset, @@ -92,7 +91,7 @@ cdef class RemoteFile: def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) return _wrap_io_future( - self._handle.pread( + deref(self._handle).pread( info.first, info.second, file_offset, From b4ac1d76201a813ab8377d7159176f6d3d5278ad Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 22 Aug 2024 14:41:55 +0200 Subject: [PATCH 42/88] cleanup --- python/kvikio/examples/aws_s3.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/kvikio/examples/aws_s3.py b/python/kvikio/examples/aws_s3.py index 03f1d416b1..0e4292da76 100644 --- a/python/kvikio/examples/aws_s3.py +++ b/python/kvikio/examples/aws_s3.py @@ -15,7 +15,7 @@ def main(): b = cupy.empty_like(a) # In this example, we launch and use a local S3 server with the - # following available address: + # following address: endpoint_url = f"http://127.0.0.1:{get_local_port()}" # In order use a local server instead of an official Amazon S3 server, @@ -35,8 +35,7 @@ def main(): # Using the context, we can open "data" as if it was a regular CuFile with kvikio.RemoteFile(context, "my-bucket", "data") as f: f.read(b) - print(a) - print(b) + assert all(a == b) if __name__ == "__main__": From 146efc052e4df005ed247846959962baf82f878a Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 22 Aug 2024 16:35:51 +0200 Subject: [PATCH 43/88] KVIKIO_AWS_SDK_FOUND --- cpp/CMakeLists.txt | 6 ++++++ cpp/include/kvikio/remote_handle.hpp | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 620a85e8a8..cca9f84a48 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -156,6 +156,7 @@ target_link_libraries( ) if(AWSSDK_FOUND) target_link_libraries(kvikio INTERFACE aws-cpp-sdk-s3) + target_compile_definitions(kvikio INTERFACE $) endif() target_compile_features(kvikio INTERFACE cxx_std_17) @@ -243,6 +244,11 @@ if(NOT already_set_kvikio) target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUFILE_STREAM_API_FOUND) endif() endif() + + find_package(AWSSDK COMPONENTS s3 QUIET) + if(AWSSDK_FOUND) + target_compile_definitions(kvikio::kvikio INTERFACE $) + endif() endif() ]=] ) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 5d6d0d6935..5bbe2f1dc7 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -33,6 +33,10 @@ #include #include +#ifndef KVIKIO_AWS_SDK_FOUND +#error "cannot include , configuration did not find AWS SDK" +#endif + namespace kvikio { namespace detail { From b19a146511660da8f99dfce3a3927339fc06a096 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 08:28:05 +0200 Subject: [PATCH 44/88] cmake: --no-s3 --- build.sh | 14 ++++++++++---- cpp/CMakeLists.txt | 17 +++++++++++------ python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/build.sh b/build.sh index adbb5851d2..1343443f4b 100755 --- a/build.sh +++ b/build.sh @@ -18,15 +18,16 @@ ARGS=$* # script, and that this script resides in the repo dir! 
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop -h" -HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"\"] [-h] +VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop --no-s3 -h" +HELP="$0 [clean] [libkvikio] [kvikio] [--no-s3] [-v] [-g] [-n] [--pydevelop] [--cmake-args=\"\"] [-h] clean - remove all existing build artifacts and configuration (start over) libkvikio - build and install the libkvikio C++ code kvikio - build and install the kvikio Python package + --no-s3 - build with no AWS S3 support -v - verbose build mode -g - build for debug -n - no install step - --pydevelop - Install Python packages in editable mode + --pydevelop - install Python packages in editable mode --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) -h - print this text default action (no args) is to build and install 'libkvikio' and 'kvikio' targets @@ -36,6 +37,7 @@ KVIKIO_BUILD_DIR="${REPODIR}/python/build ${REPODIR}/python/_skbuild" BUILD_DIRS="${LIBKVIKIO_BUILD_DIR} ${KVIKIO_BUILD_DIR}" # Set defaults for vars modified by flags to this script +ENABLE_S3_SUPPORT="-DKvikIO_AWSSDK_SUPPORT=ON" VERBOSE_FLAG="" BUILD_TYPE=Release INSTALL_TARGET=install @@ -86,6 +88,7 @@ function ensureCMakeRan { cmake -B "${LIBKVIKIO_BUILD_DIR}" -S . \ -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + ${ENABLE_S3_SUPPORT} \ ${EXTRA_CMAKE_ARGS} RAN_CMAKE=1 fi @@ -109,6 +112,9 @@ if (( ${NUMARGS} != 0 )); then fi # Process flags +if hasArg --no-s3; then + ENABLE_S3_SUPPORT="-DKvikIO_AWSSDK_SUPPORT=OFF" +fi if hasArg -v; then VERBOSE_FLAG=-v set -x @@ -150,7 +156,7 @@ if (( NUMARGS == 0 )) || hasArg libkvikio; then cmake --build "${LIBKVIKIO_BUILD_DIR}" -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} if [[ ${INSTALL_TARGET} != "" ]]; then echo "installing libkvikio..." 
- cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install -v ${VERBOSE_FLAG} + cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install ${VERBOSE_FLAG} fi fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cca9f84a48..2634becc82 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,6 +37,7 @@ rapids_cmake_build_type(Release) # build options option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) +option(KvikIO_AWSSDK_SUPPORT "Configure CMake to build with AWS S3 support" ON) rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -57,11 +58,13 @@ rapids_find_package( # If AWSSDK isn't found, the Cython module remote_handle.pyx isn't built and C++ users shouldn't # include -rapids_find_package( - AWSSDK COMPONENTS s3 - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports -) +if(KvikIO_AWSSDK_SUPPORT) + rapids_find_package( + AWSSDK REQUIRED COMPONENTS s3 + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + ) +endif() if(AWSSDK_FOUND) get_property( @@ -245,7 +248,9 @@ if(NOT already_set_kvikio) endif() endif() - find_package(AWSSDK COMPONENTS s3 QUIET) + if(KvikIO_AWSSDK_SUPPORT) + find_package(AWSSDK COMPONENTS s3 QUIET) + endif() if(AWSSDK_FOUND) target_compile_definitions(kvikio::kvikio INTERFACE $) endif() diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 9237e37116..97d068cb10 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -21,7 +21,7 @@ if(AWSSDK_FOUND) message(STATUS "Building remote_handle.pyx (aws-cpp-sdk-s3 found)") list(APPEND cython_modules remote_handle.pyx) else() - message(WARNING "Skipping remote_handle.pyx (aws-cpp-sdk-s3 not found)") + message(WARNING "Skipping remote_handle.pyx (aws-cpp-sdk-s3 not found or disabled)") endif() rapids_cython_create_modules( From 4cb54a97970e23142aa834bda734ecca99e8d246 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 23 Aug 2024 08:36:04 +0200 Subject: [PATCH 45/88] move KVIKIO_AWS_SDK_FOUND up before aws include --- cpp/include/kvikio/remote_handle.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 5bbe2f1dc7..c9a1129350 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -15,6 +15,10 @@ */ #pragma once +#ifndef KVIKIO_AWS_SDK_FOUND +#error "cannot include , configuration did not find AWS SDK" +#endif + #include #include #include @@ -22,6 +26,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -29,14 +37,6 @@ #include #include -#include -#include -#include - -#ifndef KVIKIO_AWS_SDK_FOUND -#error "cannot include , configuration did not find AWS SDK" -#endif - namespace kvikio { namespace detail { From 379f5211ed0021997419d9bd535789328f0be5d5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 08:59:15 +0200 Subject: [PATCH 46/88] parse_s3_path tests --- cpp/include/kvikio/remote_handle.hpp | 13 ++++++++++--- python/kvikio/tests/test_aws_s3.py | 24 +++++++++++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index c9a1129350..57a78cc796 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -65,9 +65,16 @@ inline std::pair parse_s3_path(const std::string& path } std::string p = path.substr(5); if (p.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } - size_t pos = p.find_first_of('/'); - if (pos == 0) { throw std::invalid_argument("The remote path does not contain a bucket name."); } - return std::make_pair(p.substr(0, pos), (pos == std::string::npos) ? 
"" : p.substr(pos + 1)); + size_t pos = p.find_first_of('/'); + std::string bucket_name = p.substr(0, pos); + if (bucket_name.empty()) { + throw std::invalid_argument("The remote path does not contain a bucket name."); + } + std::string object_name = (pos == std::string::npos) ? "" : p.substr(pos + 1); + if (object_name.empty()) { + throw std::invalid_argument("The remote path does not contain an object name."); + } + return std::make_pair(std::move(bucket_name), std::move(object_name)); } } // namespace detail diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index a4569b7601..66e633b2ce 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -122,7 +122,7 @@ def test_read(s3_base, xp): ], ) def test_read_with_file_offset(s3_base, xp, start, end): - bucket_name = "test_read" + bucket_name = "test_read_with_file_offset" object_name = "a1" a = xp.arange(end, dtype=xp.int64) with s3_context( @@ -137,3 +137,25 @@ def test_read_with_file_offset(s3_base, xp, start, end): b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) + + +def test_remote_path_error(s3_base): + bucket_name = "test_remote_path_error" + with s3_context(s3_base=s3_base, bucket=bucket_name) as ctx: + with pytest.raises(ValueError, match="No response body"): + kvikio.RemoteFile.from_url(ctx, "s3://unknown-bucket/unknown-object") + + with pytest.raises(ValueError, match="No response body"): + kvikio.RemoteFile.from_url(ctx, f"s3://{bucket_name}/unknown-object") + + with pytest.raises(ValueError, match="path must start with the S3 scheme"): + kvikio.RemoteFile.from_url(ctx, f"s3:/{bucket_name}/") + + with pytest.raises(ValueError, match="path does not contain a bucket name"): + kvikio.RemoteFile.from_url(ctx, "s3:///unknown-object") + + with pytest.raises(ValueError, match="path does not contain an object name"): + 
kvikio.RemoteFile.from_url(ctx, f"s3://{bucket_name}/") + + with pytest.raises(ValueError, match="path does not contain an object name"): + kvikio.RemoteFile.from_url(ctx, f"s3://{bucket_name}") From 8849d11482050ac64a0cc9200082d52e6c064121 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 09:14:21 +0200 Subject: [PATCH 47/88] S3Context::client(): mark const --- cpp/include/kvikio/remote_handle.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 57a78cc796..3f8ad6946b 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -54,6 +54,8 @@ class BufferAsStream : public Aws::IOStream { /** * @brief Given a file path like "s3:///", return the name of the bucket and object. * + * @throws std::invalid_argument if file path is ill-formed or is missing the bucket or object name. + * * @param path S3 file path. * @return Pair of strings: [bucket-name, object-name]. */ @@ -93,7 +95,7 @@ inline std::pair parse_s3_path(const std::string& path */ class S3Context { private: - // We use a shared point since constructing a default `Aws::S3::S3Client` before calling + // We use a shared pointer since constructing a default `Aws::S3::S3Client` before calling // `Aws::InitAPI` is illegal. std::shared_ptr _client; // Only call `Aws::ShutdownAPI`, if `Aws::InitAPI` was called on construction. @@ -125,6 +127,8 @@ class S3Context { * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see * . * + * @throws std::runtime_error if failed authentication to the S3 server. + * * @param endpoint_override If not empty, the address of the S3 server. Takes precedences * over the `AWS_ENDPOINT_URL` environment variable. */ @@ -171,7 +175,7 @@ class S3Context { * * @return S3 client. 
*/ - Aws::S3::S3Client& client() { return *_client; } + Aws::S3::S3Client const& client() const { return *_client; } // No copy and move semantic S3Context(S3Context const&) = delete; @@ -186,7 +190,7 @@ class S3Context { * @param object_name The object name. * @return Size of the file in bytes. */ - std::size_t get_file_size(const std::string& bucket_name, const std::string& object_name) + std::size_t get_file_size(const std::string& bucket_name, const std::string& object_name) const { KVIKIO_NVTX_FUNC_RANGE(); Aws::S3::Model::HeadObjectRequest req; From 8c54a6c9429bf94a33b857b42d977560269a7025 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 09:22:03 +0200 Subject: [PATCH 48/88] east const --- cpp/include/kvikio/remote_handle.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 3f8ad6946b..93ee2ea39a 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -59,7 +59,7 @@ class BufferAsStream : public Aws::IOStream { * @param path S3 file path. * @return Pair of strings: [bucket-name, object-name]. */ -inline std::pair parse_s3_path(const std::string& path) +inline std::pair parse_s3_path(std::string const& path) { if (path.empty()) { throw std::invalid_argument("The remote path cannot be an empty string."); } if (path.size() < 5 || path.substr(0, 5) != "s3://") { @@ -99,7 +99,7 @@ class S3Context { // `Aws::InitAPI` is illegal. std::shared_ptr _client; // Only call `Aws::ShutdownAPI`, if `Aws::InitAPI` was called on construction. - const bool _shutdown_s3_api; + bool const _shutdown_s3_api; public: /** @@ -132,7 +132,7 @@ class S3Context { * @param endpoint_override If not empty, the address of the S3 server. Takes precedences * over the `AWS_ENDPOINT_URL` environment variable. 
*/ - S3Context(const std::string& endpoint_override = "") : _shutdown_s3_api{true} + S3Context(std::string const& endpoint_override = "") : _shutdown_s3_api{true} { // NB: `Aws::InitAPI` has to be called before everything in the SDK beside `Aws::SDKOptions`, // even before config structs like `Aws::Client::ClientConfiguration`. @@ -143,7 +143,7 @@ class S3Context { // Create a client config where `endpoint_override` takes precedences over `AWS_ENDPOINT_URL` Aws::Client::ClientConfiguration config; - const char* ep = std::getenv("AWS_ENDPOINT_URL"); + char const* ep = std::getenv("AWS_ENDPOINT_URL"); if (!endpoint_override.empty()) { config.endpointOverride = endpoint_override; } else if (ep != nullptr && !std::string(ep).empty()) { @@ -164,7 +164,7 @@ class S3Context { try { Aws::SDKOptions options; Aws::ShutdownAPI(options); - } catch (const std::exception& e) { + } catch (std::exception const& e) { std::cerr << "~S3Context(): " << e.what() << std::endl; } } @@ -190,7 +190,7 @@ class S3Context { * @param object_name The object name. * @return Size of the file in bytes. */ - std::size_t get_file_size(const std::string& bucket_name, const std::string& object_name) const + std::size_t get_file_size(std::string const& bucket_name, std::string const& object_name) const { KVIKIO_NVTX_FUNC_RANGE(); Aws::S3::Model::HeadObjectRequest req; @@ -198,7 +198,7 @@ class S3Context { req.SetKey(object_name.c_str()); Aws::S3::Model::HeadObjectOutcome outcome = client().HeadObject(req); if (!outcome.IsSuccess()) { - const Aws::S3::S3Error& err = outcome.GetError(); + Aws::S3::S3Error const& err = outcome.GetError(); throw std::invalid_argument("get_file_size(): " + err.GetExceptionName() + ": " + err.GetMessage()); } @@ -252,7 +252,7 @@ class RemoteHandle { * @param context The S3 context used for the connection to the remove server. * @param remote_path Remote file path. 
*/ - RemoteHandle(std::shared_ptr context, const std::string& remote_path) + RemoteHandle(std::shared_ptr context, std::string const& remote_path) : RemoteHandle(std::move(context), detail::parse_s3_path(remote_path)) { } @@ -281,7 +281,7 @@ class RemoteHandle { Aws::S3::Model::GetObjectRequest req; req.SetBucket(_bucket_name.c_str()); req.SetKey(_object_name.c_str()); - const std::string byte_range = + std::string const byte_range = "bytes=" + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); req.SetRange(byte_range.c_str()); @@ -294,10 +294,10 @@ class RemoteHandle { Aws::S3::Model::GetObjectOutcome outcome = _context->client().GetObject(req); if (!outcome.IsSuccess()) { - const Aws::S3::S3Error& err = outcome.GetError(); + Aws::S3::S3Error const& err = outcome.GetError(); throw std::runtime_error(err.GetExceptionName() + ": " + err.GetMessage()); } - const std::size_t n = outcome.GetResult().GetContentLength(); + std::size_t const n = outcome.GetResult().GetContentLength(); if (n != size) { throw std::runtime_error("S3 read of " + std::to_string(size) + " bytes failed, received " + std::to_string(n) + " bytes"); @@ -329,7 +329,7 @@ class RemoteHandle { std::size_t byte_remaining = convert_size2off(size); while (byte_remaining > 0) { - const std::size_t nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); + std::size_t const nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); std::size_t nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); From bb0ad242b14e728e7f014e4ab3e5b12b3dfda90c Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 23 Aug 2024 09:34:18 +0200 Subject: [PATCH 49/88] typo --- cpp/include/kvikio/remote_handle.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 93ee2ea39a..7235f633e4 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -220,7 +220,7 @@ class RemoteHandle { /** * @brief Construct from a bucket and object name pair. * - * @param context The S3 context used for the connection to the remove server. + * @param context The S3 context used for the connection to the remote server. * @param bucket_and_object_name Name pair . */ RemoteHandle(std::shared_ptr context, @@ -236,7 +236,7 @@ class RemoteHandle { /** * @brief Construct from a bucket and object name. * - * @param context The S3 context used for the connection to the remove server. + * @param context The S3 context used for the connection to the remote server. * @param bucket_name Name of the bucket. * @param object_name Name of the object. */ @@ -249,7 +249,7 @@ class RemoteHandle { /** * @brief Construct from a remote path such as "s3:///". * - * @param context The S3 context used for the connection to the remove server. + * @param context The S3 context used for the connection to the remote server. * @param remote_path Remote file path. */ RemoteHandle(std::shared_ptr context, std::string const& remote_path) From 2e5ff0b0f8efab82e97e4d0fcd3afd64825d6441 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 23 Aug 2024 10:00:57 +0200 Subject: [PATCH 50/88] aws-sdk-cpp>=1.11.267, we need --- .../environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- .../environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/meta.yaml | 6 +++--- conda/recipes/libkvikio/meta.yaml | 16 ++++++++-------- cpp/include/kvikio/remote_handle.hpp | 5 ++--- dependencies.yaml | 2 +- 8 files changed, 18 insertions(+), 19 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 893801d935..1c4896f4bb 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -6,7 +6,7 @@ channels: - conda-forge - nvidia dependencies: -- aws-sdk-cpp +- aws-sdk-cpp>=1.11.267 - boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a76591d52e..91a1c9c94f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -6,7 +6,7 @@ channels: - conda-forge - nvidia dependencies: -- aws-sdk-cpp +- aws-sdk-cpp>=1.11.267 - boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 2bf62a49ad..a1e4b92bd5 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -6,7 +6,7 @@ channels: - conda-forge - nvidia dependencies: -- aws-sdk-cpp +- aws-sdk-cpp>=1.11.267 - boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4818fea230..fa3fe3556d 100644 --- 
a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -6,7 +6,7 @@ channels: - conda-forge - nvidia dependencies: -- aws-sdk-cpp +- aws-sdk-cpp>=1.11.267 - boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml index 605734b369..41eb27c4f9 100644 --- a/conda/recipes/kvikio/meta.yaml +++ b/conda/recipes/kvikio/meta.yaml @@ -52,7 +52,7 @@ requirements: - {{ compiler('cuda') }} {% endif %} - {{ stdlib("c") }} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 host: - python - pip @@ -65,13 +65,13 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - libkvikio ={{ version }} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 run: - python - numpy >=1.23,<3.0a0 - cupy >=12.0.0 - zarr - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 # See https://github.com/zarr-developers/numcodecs/pull/475 - numcodecs <0.12.0 - packaging diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 0c97e01abe..b4875e30e1 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -43,7 +43,7 @@ requirements: {% endif %} - ninja - {{ stdlib("c") }} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} @@ -53,7 +53,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 outputs: - name: libkvikio @@ -74,10 +74,10 @@ outputs: requirements: build: - cmake {{ cmake_version }} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 host: - cuda-version ={{ cuda_version }} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -87,7 +87,7 @@ outputs: {% else %} - libcufile-dev # [linux] {% endif %} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 test: commands: - test -f $PREFIX/include/kvikio/file_handle.hpp @@ -111,7 
+111,7 @@ outputs: - cuda-cudart-dev - libcufile-dev # [linux] {% endif %} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 requirements: build: - cmake {{ cmake_version }} @@ -124,7 +124,7 @@ outputs: - cuda-cudart-dev - libcufile-dev # [linux] {% endif %} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -134,7 +134,7 @@ outputs: - cuda-cudart - libcufile # [linux] {% endif %} - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 about: home: https://rapids.ai license: Apache-2.0 diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 7235f633e4..67159a88f7 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -135,9 +135,8 @@ class S3Context { S3Context(std::string const& endpoint_override = "") : _shutdown_s3_api{true} { // NB: `Aws::InitAPI` has to be called before everything in the SDK beside `Aws::SDKOptions`, - // even before config structs like `Aws::Client::ClientConfiguration`. - // Notice, we may call `Aws::InitAPI` multiple times, which is allowed see: - // + // even before config structs like `Aws::Client::ClientConfiguration`. However, we are now + // allowed to call `Aws::InitAPI` multiple times: Aws::SDKOptions options; Aws::InitAPI(options); diff --git a/dependencies.yaml b/dependencies.yaml index 4b3ba475b8..c3268c2b60 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,7 +106,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - aws-sdk-cpp + - aws-sdk-cpp>=1.11.267 # Need specific: - output_types: conda matrices: From 378e9b6853fcaf18b5ecf9292bb732af87590063 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 23 Aug 2024 10:05:50 +0200 Subject: [PATCH 51/88] doc --- cpp/include/kvikio/remote_handle.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 67159a88f7..b14eaa5b4c 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -127,10 +127,10 @@ class S3Context { * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see * . * - * @throws std::runtime_error if failed authentication to the S3 server. + * @throws std::runtime_error If failing to authenticate to the S3 server. * - * @param endpoint_override If not empty, the address of the S3 server. Takes precedences - * over the `AWS_ENDPOINT_URL` environment variable. + * @param endpoint_override If not empty, the address of the S3 server. This takes precedences + * over the AWS system configuration including the `AWS_ENDPOINT_URL` environment variable. */ S3Context(std::string const& endpoint_override = "") : _shutdown_s3_api{true} { From 496fe8d5b0bb96113e4906065f24b02ea8e24b60 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 10:24:24 +0200 Subject: [PATCH 52/88] doc --- python/kvikio/kvikio/remote_file.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index b950da2b32..25079afdeb 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -48,9 +48,9 @@ def __init__(self, endpoint_override: Optional[str] = None): Parameters ---------- endpoint_override - If not empty, the address of the S3 server. Takes precedences over the - `AWS_ENDPOINT_URL` environment variable. - + If not empty, the address of the S3 server. This takes precedences over + the AWS system configuration including the `AWS_ENDPOINT_URL` environment + variable. 
""" self._handle = _get_remote_module().S3Context(endpoint_override) From ea2d93ca7d706ac88334c9dfe6ece66c07efd42c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 23 Aug 2024 15:57:43 +0200 Subject: [PATCH 53/88] CI: disable devcontainer until #446 has been fixed. --- .github/workflows/pr.yaml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4499514060..f978d97b7a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -61,16 +61,17 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" - devcontainer: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 - with: - arch: '["amd64"]' - cuda: '["12.5"]' - build_command: | - sccache -z; - build-all --verbose; - sccache -s; + # TODO: uncomment when https://github.com/rapidsai/kvikio/issues/446 has been fixed. + # devcontainer: + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + # with: + # arch: '["amd64"]' + # cuda: '["12.5"]' + # build_command: | + # sccache -z; + # build-all --verbose; + # sccache -s; wheel-cpp-build: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 From 3bb3ce11aee652b734f041e0fb19ecd1fcba2102 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 26 Aug 2024 08:01:58 +0200 Subject: [PATCH 54/88] yaml: remove some aws-sdk-cpp --- conda/recipes/kvikio/meta.yaml | 1 - conda/recipes/libkvikio/meta.yaml | 3 --- 2 files changed, 4 deletions(-) diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml index 41eb27c4f9..14162dd270 100644 --- a/conda/recipes/kvikio/meta.yaml +++ b/conda/recipes/kvikio/meta.yaml @@ -52,7 +52,6 @@ requirements: - {{ compiler('cuda') }} {% endif %} - {{ stdlib("c") }} - - aws-sdk-cpp>=1.11.267 host: - python - pip diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index b4875e30e1..1ea46a59e4 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -43,7 +43,6 @@ requirements: {% endif %} - ninja - {{ stdlib("c") }} - - aws-sdk-cpp>=1.11.267 host: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} @@ -74,10 +73,8 @@ outputs: requirements: build: - cmake {{ cmake_version }} - - aws-sdk-cpp>=1.11.267 host: - cuda-version ={{ cuda_version }} - - aws-sdk-cpp>=1.11.267 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} From b9b2c44b5a1883ef7e61843aa211fe1840f8bba8 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 26 Aug 2024 08:06:02 +0200 Subject: [PATCH 55/88] ci: comment out devcontainer --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f978d97b7a..17561466c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,7 +18,7 @@ jobs: - conda-python-build - conda-python-tests - docs-build - - devcontainer + # - devcontainer - wheel-cpp-build - wheel-python-build - wheel-python-tests From c483b686cabc512c86328a46015797452942b4fd Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 27 Aug 2024 16:59:24 +0200 Subject: [PATCH 56/88] AllocRetain::ensure_alloc_size() --- cpp/include/kvikio/bounce_buffer.hpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index 77913afc5a..b47115faaa 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -70,6 +70,11 @@ class AllocRetain { } } + /** + * @brief Free all retained allocations + * + * NB: The `_mutex` must be taken prior to calling this function, if not called from the dtor. + */ void clear() { while (!_free_allocs.empty()) { @@ -78,12 +83,24 @@ class AllocRetain { } } - [[nodiscard]] Alloc get() + /** + * @brief Ensure the size of the retained allocations match `defaults::bounce_buffer_size()` + * + * NB: `_mutex` must be taken prior to calling this function. + */ + void ensure_alloc_size() { - const std::lock_guard lock(_mutex); - if (_size != defaults::bounce_buffer_size()) { + const auto bounce_buffer_size = defaults::bounce_buffer_size(); + if (_size != bounce_buffer_size) { + _size = bounce_buffer_size; clear(); // the desired allocation size has changed. } + } + + [[nodiscard]] Alloc get() + { + const std::lock_guard lock(_mutex); + ensure_alloc_size(); // Check if we have an allocation available if (!_free_allocs.empty()) { @@ -102,9 +119,7 @@ class AllocRetain { void put(void* alloc, std::size_t size) { const std::lock_guard lock(_mutex); - if (_size != defaults::bounce_buffer_size()) { - clear(); // the desired allocation size has changed. - } + ensure_alloc_size(); // If the size of `alloc` matches the sizes of the retained allocations, // it is added to the set of free allocation otherwise it is freed. From ce0c59ad87969f5debd2dacfb0c0fcef86de683e Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 27 Aug 2024 17:02:39 +0200 Subject: [PATCH 57/88] add task_size argument --- cpp/include/kvikio/remote_handle.hpp | 16 ++++++++++------ python/kvikio/tests/test_aws_s3.py | 24 ++++++++++++++++-------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b14eaa5b4c..cce5fff80d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -320,7 +320,7 @@ class RemoteHandle { CUcontext ctx = get_context_from_pointer(buf); PushAndPopContext c(ctx); - auto alloc = detail::AllocRetain::instance().get(); // Host memory allocation + auto alloc = AllocRetain::instance().get(); // Host memory allocation CUdeviceptr devPtr = convert_void2deviceptr(buf); CUstream stream = detail::StreamsByThread::get(); @@ -328,7 +328,7 @@ class RemoteHandle { std::size_t byte_remaining = convert_size2off(size); while (byte_remaining > 0) { - std::size_t const nbytes_requested = std::min(posix_bounce_buffer_size, byte_remaining); + std::size_t const nbytes_requested = std::min(alloc.size(), byte_remaining); std::size_t nbytes_got = read_to_host(alloc.get(), nbytes_requested, cur_file_offset); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); @@ -342,15 +342,19 @@ class RemoteHandle { /** * @brief Read from remote source into buffer (host or device memory) in parallel. * - * Contrary to `FileHandle::pread()`, a task size of 16 MiB is used always. - * See `kvikio::posix_bounce_buffer_size`. + * This API is a parallel async version of `.read()` that partition the operation + * into tasks of size `task_size` for execution in the default thread pool. * * @param buf Pointer to host or device memory. * @param size Number of bytes to read. * @param file_offset File offset in bytes. + * @param task_size Size of each task in bytes. 
* @return Number of bytes read, which is `size` always. */ - std::future pread(void* buf, std::size_t size, std::size_t file_offset = 0) + std::future pread(void* buf, + std::size_t size, + std::size_t file_offset = 0, + std::size_t task_size = defaults::task_size()) { KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::pread()", size); auto task = [this](void* devPtr_base, @@ -359,7 +363,7 @@ class RemoteHandle { std::size_t devPtr_offset) -> std::size_t { return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); }; - return parallel_io(task, buf, size, file_offset, posix_bounce_buffer_size, 0); + return parallel_io(task, buf, size, file_offset, task_size, 0); } }; diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 66e633b2ce..4054616f33 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -10,6 +10,7 @@ import pytest import kvikio +import kvikio.defaults # TODO: remove before PR merge. Trigger CI error if the remote module wasn't built import kvikio._lib.remote_handle # isort: skip @@ -98,18 +99,25 @@ def s3_context(s3_base, bucket, files=None): pass -def test_read(s3_base, xp): +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("nthreads", [1, 3]) +@pytest.mark.parametrize("tasksize", [99, 999]) +@pytest.mark.parametrize("buffer_size", [101, 1001]) +def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size): bucket_name = "test_read" object_name = "a1" - a = xp.arange(10_000_000) + a = xp.arange(size) with s3_context( s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} ) as ctx: - with kvikio.RemoteFile(ctx, bucket_name, object_name) as f: - assert f.nbytes() == a.nbytes - b = xp.empty_like(a) - assert f.read(buf=b) == a.nbytes - xp.testing.assert_array_equal(a, b) + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.defaults.set_task_size(tasksize): + with kvikio.defaults.set_bounce_buffer_size(buffer_size): + with 
kvikio.RemoteFile(ctx, bucket_name, object_name) as f: + assert f.nbytes() == a.nbytes + b = xp.empty_like(a) + assert f.read(buf=b) == a.nbytes + xp.testing.assert_array_equal(a, b) @pytest.mark.parametrize( @@ -118,7 +126,7 @@ def test_read(s3_base, xp): (0, 10 * 4096), (1, int(1.3 * 4096)), (int(2.1 * 4096), int(5.6 * 4096)), - (42, int(2**23)), + (42, int(2**20)), ], ) def test_read_with_file_offset(s3_base, xp, start, end): From bbb40424182e1b3f83dafd6f5dfc1719b0494362 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 28 Aug 2024 08:42:38 +0200 Subject: [PATCH 58/88] harmonic_mean --- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 6 +++++- python/kvikio/kvikio/benchmarks/single_node_io.py | 2 +- python/kvikio/kvikio/benchmarks/zarr_io.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index 9d86aa5f8e..015854dcca 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -32,6 +32,9 @@ def get_local_port() -> int: def start_s3_server(lifetime: int): + """Start a server and run it for `lifetime` minutes. + NB: to stop before `lifetime`, kill the process/thread running this function. 
+ """ from moto.server import ThreadedMotoServer # Silence the activity info from ThreadedMotoServer @@ -44,6 +47,7 @@ def start_s3_server(lifetime: int): @contextlib.contextmanager def local_s3_server(lifetime: int): + """Start a server and run it for `lifetime` minutes or kill it on context exit""" # Use fake aws credentials os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" @@ -155,7 +159,7 @@ def main(args): def pprint_api_res(name, samples): samples = [args.nbytes / s for s in samples] # Convert to throughput - mean = statistics.mean(samples) if len(samples) > 1 else samples[0] + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] ret = f"{api}-{name}".ljust(18) ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) if len(samples) > 1: diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index 4d47a80791..3ff4acf9c7 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -284,7 +284,7 @@ def main(args): ws.append(args.nbytes / write) def pprint_api_res(name, samples): - mean = statistics.mean(samples) if len(samples) > 1 else samples[0] + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] ret = f"{api} {name}".ljust(18) ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) if len(samples) > 1: diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index fc226c2263..3ebeeea707 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -150,7 +150,7 @@ def main(args): ws.append(args.nbytes / write) def pprint_api_res(name, samples): - mean = statistics.mean(samples) if len(samples) > 1 else samples[0] + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] ret = f"{api} {name}".ljust(18) ret += f"| 
{format_bytes(mean).rjust(10)}/s".ljust(14) if len(samples) > 1: From f363f15b7059fc782112c3e7b51163dc35b5ca31 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 28 Aug 2024 08:54:36 +0200 Subject: [PATCH 59/88] use MonkeyPatch --- python/kvikio/kvikio/remote_file.py | 3 ++ python/kvikio/tests/test_aws_s3.py | 53 +++++++++-------------------- 2 files changed, 20 insertions(+), 36 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 25079afdeb..9779030ce5 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -3,11 +3,13 @@ from __future__ import annotations +import functools from typing import Optional from kvikio.cufile import IOFuture +@functools.cache def is_remote_file_available() -> bool: """Check if the remote module is available""" try: @@ -18,6 +20,7 @@ def is_remote_file_available() -> bool: return True +@functools.cache def _get_remote_module(): """Get the remote module or raise an error""" if not is_remote_file_available(): diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 4054616f33..0498796255 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -2,7 +2,6 @@ # See file LICENSE for terms. import multiprocessing as mp -import os import socket import time from contextlib import contextmanager @@ -41,21 +40,6 @@ def endpoint_port(): return port -@contextmanager -def ensure_safe_environment_variables(): - """ - Get a context manager to safely set environment variables - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. 
- """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) - - def start_s3_server(ip_address, port): server = moto.server.ThreadedMotoServer(ip_address=ip_address, port=port) server.start() @@ -65,16 +49,14 @@ def start_s3_server(ip_address, port): @pytest.fixture(scope="session") def s3_base(endpoint_ip, endpoint_port): - """ - Fixture to set up moto server in separate process - """ - with ensure_safe_environment_variables(): + """Fixture to set up moto server in separate process""" + with pytest.MonkeyPatch.context() as monkeypatch: # Use fake aws credentials - os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" - os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" - os.environ["AWS_SECURITY_TOKEN"] = "foobar_security_token" - os.environ["AWS_SESSION_TOKEN"] = "foobar_session_token" - os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_SECURITY_TOKEN", "foobar_security_token") + monkeypatch.setenv("AWS_SESSION_TOKEN", "foobar_session_token") + monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") p = mp.Process(target=start_s3_server, args=(endpoint_ip, endpoint_port)) p.start() @@ -86,17 +68,16 @@ def s3_base(endpoint_ip, endpoint_port): def s3_context(s3_base, bucket, files=None): if files is None: files = {} - with ensure_safe_environment_variables(): - client = boto3.client("s3", endpoint_url=s3_base) - client.create_bucket(Bucket=bucket, ACL="public-read-write") - for f, data in files.items(): - client.put_object(Bucket=bucket, Key=f, Body=data) - yield kvikio.S3Context(s3_base) - for f, data in files.items(): - try: - client.delete_object(Bucket=bucket, Key=f) - except Exception: - pass + client = boto3.client("s3", endpoint_url=s3_base) + client.create_bucket(Bucket=bucket, ACL="public-read-write") + for f, data in files.items(): + 
client.put_object(Bucket=bucket, Key=f, Body=data) + yield kvikio.S3Context(s3_base) + for f, data in files.items(): + try: + client.delete_object(Bucket=bucket, Key=f) + except Exception: + pass @pytest.mark.parametrize("size", [10, 100, 1000]) From 8129d1719bb5d38dd88fddc0e83b7166e38e9f27 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 28 Aug 2024 08:56:14 +0200 Subject: [PATCH 60/88] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- README.md | 2 +- cpp/include/kvikio/remote_handle.hpp | 5 +++-- python/kvikio/tests/test_examples.py | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f4807f1b90..aeb6ddfa46 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The C++ library is header-only making it easy to include in [existing projects]( * Concurrent reads and writes using an internal thread pool. * Non-blocking API. * Read/write to both host and device memory seamlessly. -* Provides remote read from AWS S3 storage seamlessly. +* Provides compile-time optional remote read from AWS S3 storage seamlessly, using the [AWS SDK](https://docs.aws.amazon.com/sdkref/latest/guide/overview.html). * Provides Python bindings to [nvCOMP](https://github.com/NVIDIA/nvcomp). diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index cce5fff80d..d12616fdc9 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -82,7 +82,7 @@ inline std::pair parse_s3_path(std::string const& path } // namespace detail /** - * @brief S3 context, which initializes and maintains the S3 SDK and client. + * @brief S3 context that initializes and maintains the S3 SDK and client. * * If not given an existing S3 client, S3Context calls `Aws::InitAPI()` and `Aws::ShutdownAPI`, * which inherit some limitations from the SDK. 
@@ -121,7 +121,7 @@ class S3Context { * The S3 SDK is automatically initialized on construction and shutdown on destruction. * * The new S3 client use the default `Aws::Client::ClientConfiguration`, thus please make sure - * that AWS credentials have been configure on the system. A common way to do this, is to set the + * that AWS credentials have been configured on the system. A common way to do this, is to set the * environment variables: `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. * * Other relevant options are `AWS_DEFAULT_REGION` and `AWS_ENDPOINT_URL`, see @@ -160,6 +160,7 @@ class S3Context { ~S3Context() noexcept { if (_shutdown_s3_api) { + _client = nullptr; try { Aws::SDKOptions options; Aws::ShutdownAPI(options); diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index 8956ba7bd5..e30c8874ab 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -30,13 +30,13 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") +@pytest.mark.skipif( + not kvikio.is_remote_file_available(), + reason="KvikIO not built with AWS S3 support", +) def test_aws_s3(monkeypatch): """Test examples/aws_s3.py""" - if not kvikio.is_remote_file_available(): - pytest.skip( - "cannot test remote IO, please build KvikIO with with AWS S3 support" - ) # Fail early if dependencies isn't available import boto3 # noqa: F401 import moto # noqa: F401 From fb95490e0c590aedfe35d99b277e4f59bfbb0a49 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 28 Aug 2024 09:12:03 +0200 Subject: [PATCH 61/88] doc --- cpp/include/kvikio/remote_handle.hpp | 9 ++++++++- python/kvikio/tests/test_examples.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index d12616fdc9..b91bff05ec 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -160,7 +160,14 @@ class S3Context { ~S3Context() noexcept { if (_shutdown_s3_api) { - _client = nullptr; + // Since we created the S3 client and we only provide const reference access, + // we should be the only reference. + if (_client.use_count() != 1) { + std::cerr << "~S3Context(): S3 client has multiple owners, cannot shutdown the AWS API" + << std::endl; + return; + } + _client = nullptr; // Close the client before shutting down the API try { Aws::SDKOptions options; Aws::ShutdownAPI(options); diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index e30c8874ab..4ec44fbc2c 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -31,7 +31,7 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): @pytest.mark.skipif( - not kvikio.is_remote_file_available(), + not kvikio.is_remote_file_available(), reason="KvikIO not built with AWS S3 support", ) def test_aws_s3(monkeypatch): From 2729dbf16588b80c339cd41d5c03ff64cfb11a09 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 28 Aug 2024 09:34:03 +0200 Subject: [PATCH 62/88] test: 10mins timeout --- python/kvikio/tests/test_aws_s3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 0498796255..7d7f15ee24 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -43,8 +43,8 @@ def endpoint_port(): def start_s3_server(ip_address, port): server = moto.server.ThreadedMotoServer(ip_address=ip_address, port=port) server.start() - time.sleep(180) - print("ThreadedMotoServer shutting down because of timeout (180s)") + time.sleep(600) + print("ThreadedMotoServer shutting down because of timeout (10min)") @pytest.fixture(scope="session") From ce08365a58b3b8cfceb8d05851a8c25e936b64e4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 28 Aug 2024 09:42:05 +0200 Subject: [PATCH 63/88] doc --- cpp/doxygen/main_page.md | 14 ++++++++++++++ docs/source/runtime_settings.rst | 12 ++++++++++++ 2 files changed, 26 insertions(+) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 2b404f835e..af0282c6d0 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -85,15 +85,29 @@ Set the environment variable `KVIKIO_COMPAT_MODE` to enable/disable compatibilit - when running in Windows Subsystem for Linux (WSL). - when `/run/udev` isn't readable, which typically happens when running inside a docker image not launched with `--volume /run/udev:/run/udev:ro`. +This setting can also be controlled by `defaults::compat_mode()` and `defaults::compat_mode_reset()`. + + #### Thread Pool (KVIKIO_NTHREADS) KvikIO can use multiple threads for IO automatically. Set the environment variable `KVIKIO_NTHREADS` to the number of threads in the thread pool. If not set, the default value is 1. +This setting can also be controlled by `defaults::thread_pool_nthreads()` and `defaults::thread_pool_nthreads_reset()`. 
+ #### Task Size (KVIKIO_TASK_SIZE) KvikIO splits parallel IO operations into multiple tasks. Set the environment variable `KVIKIO_TASK_SIZE` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB). +This setting can also be controlled by `defaults::task_size()` and `defaults::task_size_reset()`. + #### GDS Threshold (KVIKIO_GDS_THRESHOLD) In order to improve performance of small IO, `.pread()` and `.pwrite()` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable `KVIKIO_GDS_THRESHOLD` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB). +This setting can also be controlled by `defaults::gds_threshold()` and `defaults::gds_threshold_reset()`. + +#### Size of the Bounce Buffer (KVIKIO_GDS_THRESHOLD) +KvikIO might have to use an intermediate host buffer when copying between file and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to size (in bytes) of this "bounce" buffer. If not set, the default value is 16777216 (16 MiB). + +This setting can also be controlled by `defaults::bounce_buffer_size()` and `defaults::bounce_buffer_size_reset()`. + ## Example diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index 2d03eb2f87..0302fe8aac 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -10,17 +10,29 @@ Set the environment variable ``KVIKIO_COMPAT_MODE`` to enable/disable compatibil * when running in Windows Subsystem for Linux (WSL). * when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``. +This setting can also be controlled by :py:func:`kvikio.defaults.compat_mode`, :py:func:`kvikio.defaults.compat_mode_reset`, and :py:func:`kvikio.defaults.set_compat_mode`. 
+ Thread Pool ``KVIKIO_NTHREADS`` ------------------------------- KvikIO can use multiple threads for IO automatically. Set the environment variable ``KVIKIO_NTHREADS`` to the number of threads in the thread pool. If not set, the default value is 1. +This setting can also be controlled by :py:func:`kvikio.defaults.get_num_threads`, :py:func:`kvikio.defaults.num_threads_reset`, and :py:func:`kvikio.defaults.set_num_threads`. Task Size ``KVIKIO_TASK_SIZE`` ------------------------------ KvikIO splits parallel IO operations into multiple tasks. Set the environment variable ``KVIKIO_TASK_SIZE`` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB). +This setting can also be controlled by :py:func:`kvikio.defaults.task_size`, :py:func:`kvikio.defaults.task_size_reset`, and :py:func:`kvikio.defaults.set_task_size`. GDS Threshold ``KVIKIO_GDS_THRESHOLD`` -------------------------------------- In order to improve performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB). + +This setting can also be controlled by :py:func:`kvikio.defaults.gds_threshold`, :py:func:`kvikio.defaults.gds_threshold_reset`, and :py:func:`kvikio.defaults.set_gds_threshold`. + +Size of the Bounce Buffer ``KVIKIO_BOUNCE_BUFFER_SIZE`` +------------------------------------------------------- +KvikIO might have to use an intermediate host buffer when copying between file and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to the size (in bytes) of this "bounce" buffer. If not set, the default value is 16777216 (16 MiB). + +This setting can also be controlled by :py:func:`kvikio.defaults.bounce_buffer_size`, :py:func:`kvikio.defaults.bounce_buffer_size_reset`, and :py:func:`kvikio.defaults.set_bounce_buffer_size`.
From a60011ae3651ee79deed7d7f4d72bae36507cdce Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 15:57:33 -0400 Subject: [PATCH 64/88] Get aws-sdk-cpp through CPM --- cpp/CMakeLists.txt | 8 ++--- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 33 +++++++++++++++++++ .../0001-Don-t-set-CMP0077-to-OLD.patch | 26 +++++++++++++++ 3 files changed, 61 insertions(+), 6 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake create mode 100644 cpp/cmake/thirdparty/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2634becc82..ee113ba4af 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -59,14 +59,10 @@ rapids_find_package( # If AWSSDK isn't found, the Cython module remote_handle.pyx isn't built and C++ users shouldn't # include if(KvikIO_AWSSDK_SUPPORT) - rapids_find_package( - AWSSDK REQUIRED COMPONENTS s3 - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports - ) + include(cmake/thirdparty/get_aws_sdk_cpp.cmake) endif() -if(AWSSDK_FOUND) +if(TARGET aws-cpp-sdk-s3) get_property( _lib_type TARGET aws-cpp-sdk-s3 diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake new file mode 100644 index 0000000000..eb08e1afad --- /dev/null +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds aws-sdk-cpp and sets any additional necessary environment variables. +function(find_and_configure_aws_sdk_cpp) + include(${rapids-cmake-dir}/cpm/find.cmake) + + rapids_cpm_find( + AWSSDK 1.11.393 + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + COMPONENTS S3 + GLOBAL_TARGETS aws-cpp-sdk-s3 + CPM_ARGS + GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git + GIT_TAG 1.11.393 + PATCH_COMMAND ${CMAKE_COMMAND} -E env GIT_COMMITTER_NAME=rapids-cmake GIT_COMMITTER_EMAIL=rapids.cmake@rapids.ai git am ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch + OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" + ) +endfunction() + +find_and_configure_aws_sdk_cpp() diff --git a/cpp/cmake/thirdparty/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch b/cpp/cmake/thirdparty/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch new file mode 100644 index 0000000000..b1f4168436 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch @@ -0,0 +1,26 @@ +From 7b24166a73e422e65b725ffcb0acd20ab493fac0 Mon Sep 17 00:00:00 2001 +From: Kyle Edwards +Date: Wed, 28 Aug 2024 15:32:07 -0400 +Subject: [PATCH] Don't set CMP0077 to OLD + +--- + CMakeLists.txt | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c17ff8a07b1..b30bc81b6df 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -13,10 +13,6 @@ if (LEGACY_BUILD) + "update the build flags as mentioned in README.md and set -DLEGACY_BUILD=OFF. " + "The legacy support will be removed at 1.12.0 release.") + +- if (POLICY CMP0077) +- cmake_policy(SET CMP0077 OLD) # CMP0077: option() honors normal variables. 
Introduced in 3.13 +- endif () +- + get_filename_component(AWS_NATIVE_SDK_ROOT "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + + # Cmake invocation variables: +-- +2.34.1 From 0d68e4974179aa0c094482bbcc1198e00915b1b7 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 15:59:11 -0400 Subject: [PATCH 65/88] Enable devcontainers --- .github/workflows/pr.yaml | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 17561466c1..c7e2abbbbd 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -61,17 +61,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" - # TODO: uncomment when https://github.com/rapidsai/kvikio/issues/446 has been fixed. - # devcontainer: - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 - # with: - # arch: '["amd64"]' - # cuda: '["12.5"]' - # build_command: | - # sccache -z; - # build-all --verbose; - # sccache -s; + devcontainer: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + with: + arch: '["amd64"]' + cuda: '["12.5"]' + build_command: | + sccache -z; + build-all --verbose; + sccache -s; wheel-cpp-build: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 From 72812cc8ab71d73926d990b07186fc53ba2f7b9c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 15:59:50 -0400 Subject: [PATCH 66/88] Formatting --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index eb08e1afad..92eb5c7234 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -25,7 +25,15 @@ 
function(find_and_configure_aws_sdk_cpp) CPM_ARGS GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git GIT_TAG 1.11.393 - PATCH_COMMAND ${CMAKE_COMMAND} -E env GIT_COMMITTER_NAME=rapids-cmake GIT_COMMITTER_EMAIL=rapids.cmake@rapids.ai git am ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch + PATCH_COMMAND + ${CMAKE_COMMAND} + -E + env + GIT_COMMITTER_NAME=rapids-cmake + GIT_COMMITTER_EMAIL=rapids.cmake@rapids.ai + git + am + ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" ) endfunction() From c4ccd9ef2c41c0acb8e27671a177e09bb58db49b Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 16:01:24 -0400 Subject: [PATCH 67/88] pr-builder --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c7e2abbbbd..4499514060 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,7 +18,7 @@ jobs: - conda-python-build - conda-python-tests - docs-build - # - devcontainer + - devcontainer - wheel-cpp-build - wheel-python-build - wheel-python-tests From 4f765d023be66427576ffe1cbd4ccf2eaf23b3b0 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 16:03:23 -0400 Subject: [PATCH 68/88] Style --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 92eb5c7234..5310fd5004 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -18,9 +18,9 @@ function(find_and_configure_aws_sdk_cpp) rapids_cpm_find( AWSSDK 1.11.393 + COMPONENTS S3 BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports - COMPONENTS S3 GLOBAL_TARGETS aws-cpp-sdk-s3 CPM_ARGS GIT_REPOSITORY 
https://github.com/aws/aws-sdk-cpp.git From 355d1924f1ac11c7df763ef6a8739161ea9ff01c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 16:04:25 -0400 Subject: [PATCH 69/88] Argument ordering --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 5310fd5004..3f43f57b51 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -18,10 +18,10 @@ function(find_and_configure_aws_sdk_cpp) rapids_cpm_find( AWSSDK 1.11.393 + GLOBAL_TARGETS aws-cpp-sdk-s3 COMPONENTS S3 BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports - GLOBAL_TARGETS aws-cpp-sdk-s3 CPM_ARGS GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git GIT_TAG 1.11.393 From 500681a35b451d3c947b38ed94450abc568f31ef Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 16:06:44 -0400 Subject: [PATCH 70/88] Style --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 3f43f57b51..2971f22597 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -18,8 +18,7 @@ function(find_and_configure_aws_sdk_cpp) rapids_cpm_find( AWSSDK 1.11.393 - GLOBAL_TARGETS aws-cpp-sdk-s3 - COMPONENTS S3 + GLOBAL_TARGETS aws-cpp-sdk-s3 COMPONENTS S3 BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports CPM_ARGS From d2a49c9b398b6de3376b661f879613fdd42b4cd3 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 28 Aug 2024 16:14:29 -0400 Subject: [PATCH 71/88] Relax aws-sdk-cpp version --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 
2971f22597..5883ff7289 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -17,7 +17,7 @@ function(find_and_configure_aws_sdk_cpp) include(${rapids-cmake-dir}/cpm/find.cmake) rapids_cpm_find( - AWSSDK 1.11.393 + AWSSDK 1.11.267 GLOBAL_TARGETS aws-cpp-sdk-s3 COMPONENTS S3 BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports From bd836553c7d5d4572c94c490c5a143dcdf1e9999 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 09:26:43 -0400 Subject: [PATCH 72/88] Find debug --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 5883ff7289..c9f9a64ac8 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -16,6 +16,7 @@ function(find_and_configure_aws_sdk_cpp) include(${rapids-cmake-dir}/cpm/find.cmake) + set(CMAKE_FIND_DEBUG_MODE ON) rapids_cpm_find( AWSSDK 1.11.267 GLOBAL_TARGETS aws-cpp-sdk-s3 COMPONENTS S3 @@ -35,6 +36,7 @@ function(find_and_configure_aws_sdk_cpp) ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" ) + set(CMAKE_FIND_DEBUG_MODE) endfunction() find_and_configure_aws_sdk_cpp() From 2687cc69cf518c6afd772528a80607ea4fec4fd1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 09:38:58 -0400 Subject: [PATCH 73/88] CPM_USE_LOCAL_PACKAGES --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index c9f9a64ac8..3156233ccc 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -16,6 +16,7 @@ function(find_and_configure_aws_sdk_cpp) include(${rapids-cmake-dir}/cpm/find.cmake) + 
set(CPM_USE_LOCAL_PACKAGES ON) set(CMAKE_FIND_DEBUG_MODE ON) rapids_cpm_find( AWSSDK 1.11.267 From 8fe82b9c3fe3804bdd32a3e2af93ef2a417bece1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 09:45:17 -0400 Subject: [PATCH 74/88] No debug find --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 3156233ccc..a05f9002c6 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -16,8 +16,9 @@ function(find_and_configure_aws_sdk_cpp) include(${rapids-cmake-dir}/cpm/find.cmake) + # Attempt to use find_package() - the patch is only needed if building from source set(CPM_USE_LOCAL_PACKAGES ON) - set(CMAKE_FIND_DEBUG_MODE ON) + rapids_cpm_find( AWSSDK 1.11.267 GLOBAL_TARGETS aws-cpp-sdk-s3 COMPONENTS S3 @@ -37,7 +38,6 @@ function(find_and_configure_aws_sdk_cpp) ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" ) - set(CMAKE_FIND_DEBUG_MODE) endfunction() find_and_configure_aws_sdk_cpp() From fdb9a593e39b9407f3a9f9e0939fbe8bc49b2f2c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 10:44:18 -0400 Subject: [PATCH 75/88] Fix AWS linking --- cpp/CMakeLists.txt | 7 ++++--- python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ee113ba4af..0afcb888e0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -153,8 +153,8 @@ target_include_directories( target_link_libraries( kvikio INTERFACE Threads::Threads ${CMAKE_DL_LIBS} nvtx3::nvtx3-cpp BS::thread_pool ) -if(AWSSDK_FOUND) - target_link_libraries(kvikio INTERFACE aws-cpp-sdk-s3) +if(TARGET aws-cpp-sdk-s3) + target_link_libraries(kvikio INTERFACE $) 
target_compile_definitions(kvikio INTERFACE $) endif() target_compile_features(kvikio INTERFACE cxx_std_17) @@ -247,7 +247,8 @@ if(NOT already_set_kvikio) if(KvikIO_AWSSDK_SUPPORT) find_package(AWSSDK COMPONENTS s3 QUIET) endif() - if(AWSSDK_FOUND) + if(TARGET aws-cpp-sdk-s3) + target_link_libraries(kvikio::kvikio INTERFACE aws-cpp-sdk-s3) target_compile_definitions(kvikio::kvikio INTERFACE $) endif() endif() diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 97d068cb10..f6e14ba1dc 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -17,7 +17,7 @@ set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_ha libnvcomp.pyx libnvcomp_ll.pyx ) -if(AWSSDK_FOUND) +if(TARGET aws-cpp-sdk-s3) message(STATUS "Building remote_handle.pyx (aws-cpp-sdk-s3 found)") list(APPEND cython_modules remote_handle.pyx) else() From 4e4389f7868460f27f3350dfc01de79711550f18 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 11:50:39 -0400 Subject: [PATCH 76/88] Re-run CI From cebc0a8e8ea0d35b7df7ddf9e092354ac0a9a8a3 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 13:28:35 -0400 Subject: [PATCH 77/88] Install libcurl4-openssl-dev for pip devcontainers --- .devcontainer/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97f..a21e66c170 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -7,6 +7,11 @@ FROM ${BASE} as pip-base ENV DEFAULT_VIRTUAL_ENV=rapids +RUN apt update -y \ + && DEBIAN_FRONTEND=noninteractive apt install -y \ + libcurl4-openssl-dev \ + && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; + FROM ${BASE} as conda-base ENV DEFAULT_CONDA_ENV=rapids From 61a5d5042957d31d6e845a97a6f51a0a2cc2bf9a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 15:10:30 -0400 Subject: [PATCH 78/88] Link against 
aws-cpp-sdk-core --- python/kvikio/kvikio/_lib/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index f6e14ba1dc..8fdc07d401 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -17,9 +17,10 @@ set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_ha libnvcomp.pyx libnvcomp_ll.pyx ) -if(TARGET aws-cpp-sdk-s3) +if(TARGET aws-cpp-sdk-core AND TARGET aws-cpp-sdk-s3) message(STATUS "Building remote_handle.pyx (aws-cpp-sdk-s3 found)") list(APPEND cython_modules remote_handle.pyx) + set(aws_cpp_sdk_core_dep aws-cpp-sdk-core) else() message(WARNING "Skipping remote_handle.pyx (aws-cpp-sdk-s3 not found or disabled)") endif() @@ -27,5 +28,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp ${aws_cpp_sdk_core_dep} ) From d55030a66c1e6cb39210116482ab0bf013dc4baa Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 16:02:20 -0400 Subject: [PATCH 79/88] Try excluding curl --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index a05f9002c6..2b8384f68a 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -35,8 +35,9 @@ function(find_and_configure_aws_sdk_cpp) GIT_COMMITTER_EMAIL=rapids.cmake@rapids.ai git am + --no-gpg-sign ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch - OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" + OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" "NO_HTTP_CLIENT ON" ) endfunction() From 
21236664f07defe7e347131f5bc3f15d301b4838 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 29 Aug 2024 16:04:04 -0400 Subject: [PATCH 80/88] Style --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 2b8384f68a..3d1fedd28b 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -37,7 +37,8 @@ function(find_and_configure_aws_sdk_cpp) am --no-gpg-sign ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch - OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" "NO_HTTP_CLIENT ON" + OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" + "NO_HTTP_CLIENT ON" ) endfunction() From bb445a39618391166dce6dac5b6f98b5c7b19b06 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 30 Aug 2024 10:53:20 -0400 Subject: [PATCH 81/88] Try AWS's HTTP client --- cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake index 3d1fedd28b..6d32889e1f 100644 --- a/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake +++ b/cpp/cmake/thirdparty/get_aws_sdk_cpp.cmake @@ -38,7 +38,7 @@ function(find_and_configure_aws_sdk_cpp) --no-gpg-sign ${CMAKE_CURRENT_LIST_DIR}/patches/aws-sdk-cpp/0001-Don-t-set-CMP0077-to-OLD.patch OPTIONS "BUILD_ONLY s3" "BUILD_SHARED_LIBS OFF" "ENABLE_TESTING OFF" "ENABLE_UNITY_BUILD ON" - "NO_HTTP_CLIENT ON" + "USE_CRT_HTTP_CLIENT ON" ) endfunction() From 4701b52491f7022d6da486e5a429fdac1ed1570e Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 30 Aug 2024 12:16:51 -0400 Subject: [PATCH 82/88] No need to install libcurl --- .devcontainer/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git 
a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a21e66c170..9d35e3f97f 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -7,11 +7,6 @@ FROM ${BASE} as pip-base ENV DEFAULT_VIRTUAL_ENV=rapids -RUN apt update -y \ - && DEBIAN_FRONTEND=noninteractive apt install -y \ - libcurl4-openssl-dev \ - && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; - FROM ${BASE} as conda-base ENV DEFAULT_CONDA_ENV=rapids From 2bd411e9b9bca8b8671fba9de263f5e3f8337718 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Sep 2024 08:58:06 +0200 Subject: [PATCH 83/88] libcudf_s3_io --- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 8 ++++---- python/kvikio/tests/test_benchmarks.py | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index 015854dcca..d7ec858eee 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -103,10 +103,10 @@ def run() -> float: yield run() -def run_cudf(args, use_kvikio_s3: bool): +def run_cudf(args, libcudf_s3_io: bool): import cudf - cudf.set_option("native_s3_io", use_kvikio_s3) + cudf.set_option("libcudf_s3_io", libcudf_s3_io) # Upload data to S3 server create_client_and_bucket() @@ -127,8 +127,8 @@ def run() -> float: API = { "cupy-kvikio": partial(run_numpy_like, xp=cupy), "numpy-kvikio": partial(run_numpy_like, xp=numpy), - "cudf-kvikio": partial(run_cudf, use_kvikio_s3=True), - "cudf-fsspec": partial(run_cudf, use_kvikio_s3=False), + "cudf-kvikio": partial(run_cudf, libcudf_s3_io=True), + "cudf-fsspec": partial(run_cudf, libcudf_s3_io=False), } diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index ea35b7793e..7c479256ef 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -82,6 +82,21 @@ def test_zarr_io(run_cmd, tmp_path, 
api): assert retcode == 0 +def skipif_libcudf_s3_io_option_is_not_available() -> None: + """Call pytest.skip() if cudf or its "libcudf_s3_io" option isn't available + + See + """ + cudf = pytest.importorskip("cudf") + try: + cudf.get_option("libcudf_s3_io") + except KeyError: + pytest.skip( + """cudf doesn't has the "libcudf_s3_io" option, """ + "see " + ) + + @pytest.mark.parametrize( "api", [ @@ -102,9 +117,8 @@ def test_aws_s3_io(run_cmd, api): import boto3 # noqa: F401 import moto # noqa: F401 - # TODO: change to import once https://github.com/rapidsai/cudf/pull/16499 is merged if "cudf" in api: - pytest.importorskip("cudf") + skipif_libcudf_s3_io_option_is_not_available() retcode = run_cmd( cmd=[ From b978323edd0578e50d5410f790dc28c19b77c8a4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 15:01:20 +0200 Subject: [PATCH 84/88] test: ignore deprecation warning --- python/kvikio/tests/test_aws_s3.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 7d7f15ee24..8abf02ab3f 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -4,6 +4,7 @@ import multiprocessing as mp import socket import time +import warnings from contextlib import contextmanager import pytest @@ -68,16 +69,20 @@ def s3_base(endpoint_ip, endpoint_port): def s3_context(s3_base, bucket, files=None): if files is None: files = {} - client = boto3.client("s3", endpoint_url=s3_base) - client.create_bucket(Bucket=bucket, ACL="public-read-write") - for f, data in files.items(): - client.put_object(Bucket=bucket, Key=f, Body=data) - yield kvikio.S3Context(s3_base) - for f, data in files.items(): - try: - client.delete_object(Bucket=bucket, Key=f) - except Exception: - pass + with warnings.catch_warnings(): + # boto3 calls `datetime.datetime.utcnow()`, which is deprecated + # in Python v3.12. 
+ warnings.filterwarnings("ignore", category=DeprecationWarning) + client = boto3.client("s3", endpoint_url=s3_base) + client.create_bucket(Bucket=bucket, ACL="public-read-write") + for f, data in files.items(): + client.put_object(Bucket=bucket, Key=f, Body=data) + yield kvikio.S3Context(s3_base) + for f, data in files.items(): + try: + client.delete_object(Bucket=bucket, Key=f) + except Exception: + pass @pytest.mark.parametrize("size", [10, 100, 1000]) From 00304e718c6d9434afcfcca569c5defdfcff3283 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 15:01:48 +0200 Subject: [PATCH 85/88] test: remove the kvikio._lib.remote_handle trigger --- python/kvikio/tests/test_aws_s3.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 8abf02ab3f..858765beb6 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -12,9 +12,6 @@ import kvikio import kvikio.defaults -# TODO: remove before PR merge. Trigger CI error if the remote module wasn't built -import kvikio._lib.remote_handle # isort: skip - pytestmark = pytest.mark.skipif( not kvikio.is_remote_file_available(), reason="cannot test remote IO, please build KvikIO with with AWS S3 support", From f9b49f2997c0af55ac39fe1144d945df0fda66ad Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 6 Sep 2024 15:28:35 +0200 Subject: [PATCH 86/88] Revert "test: ignore deprecation warning" This reverts commit b978323edd0578e50d5410f790dc28c19b77c8a4. 
--- python/kvikio/tests/test_aws_s3.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/python/kvikio/tests/test_aws_s3.py b/python/kvikio/tests/test_aws_s3.py index 858765beb6..7e2490cb74 100644 --- a/python/kvikio/tests/test_aws_s3.py +++ b/python/kvikio/tests/test_aws_s3.py @@ -4,7 +4,6 @@ import multiprocessing as mp import socket import time -import warnings from contextlib import contextmanager import pytest @@ -66,20 +65,16 @@ def s3_base(endpoint_ip, endpoint_port): def s3_context(s3_base, bucket, files=None): if files is None: files = {} - with warnings.catch_warnings(): - # boto3 calls `datetime.datetime.utcnow()`, which is deprecated - # in Python v3.12. - warnings.filterwarnings("ignore", category=DeprecationWarning) - client = boto3.client("s3", endpoint_url=s3_base) - client.create_bucket(Bucket=bucket, ACL="public-read-write") - for f, data in files.items(): - client.put_object(Bucket=bucket, Key=f, Body=data) - yield kvikio.S3Context(s3_base) - for f, data in files.items(): - try: - client.delete_object(Bucket=bucket, Key=f) - except Exception: - pass + client = boto3.client("s3", endpoint_url=s3_base) + client.create_bucket(Bucket=bucket, ACL="public-read-write") + for f, data in files.items(): + client.put_object(Bucket=bucket, Key=f, Body=data) + yield kvikio.S3Context(s3_base) + for f, data in files.items(): + try: + client.delete_object(Bucket=bucket, Key=f) + except Exception: + pass @pytest.mark.parametrize("size", [10, 100, 1000]) From 04036869e38769f05691f448f50adc6d191609fa Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Fri, 6 Sep 2024 15:30:49 +0200 Subject: [PATCH 87/88] pytest: ignore deprecation warning in botocore --- python/kvikio/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 3fd543479c..8a6b7d4b36 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -140,4 +140,5 @@ regex = "(?P.*)" filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", + "ignore::DeprecationWarning:botocore.*", ] From 000126516db430988ab9af5ee1576ca3fe6afe27 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 9 Sep 2024 10:18:49 +0200 Subject: [PATCH 88/88] Apply suggestions from code review Co-authored-by: Bradley Dice --- cpp/include/kvikio/remote_handle.hpp | 6 +++--- python/kvikio/kvikio/benchmarks/aws_s3_io.py | 2 +- python/kvikio/kvikio/remote_file.py | 8 ++++---- python/kvikio/tests/test_benchmarks.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b91bff05ec..0e0fc09f83 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -129,7 +129,7 @@ class S3Context { * * @throws std::runtime_error If failing to authenticate to the S3 server. * - * @param endpoint_override If not empty, the address of the S3 server. This takes precedences + * @param endpoint_override If not empty, the address of the S3 server. This takes precedence * over the AWS system configuration including the `AWS_ENDPOINT_URL` environment variable. 
*/ S3Context(std::string const& endpoint_override = "") : _shutdown_s3_api{true} @@ -140,7 +140,7 @@ class S3Context { Aws::SDKOptions options; Aws::InitAPI(options); - // Create a client config where `endpoint_override` takes precedences over `AWS_ENDPOINT_URL` + // Create a client config where `endpoint_override` takes precedence over `AWS_ENDPOINT_URL` Aws::Client::ClientConfiguration config; char const* ep = std::getenv("AWS_ENDPOINT_URL"); if (!endpoint_override.empty()) { @@ -318,7 +318,7 @@ class RemoteHandle { * @param buf Pointer to host or device memory. * @param size Number of bytes to read. * @param file_offset File offset in bytes. - * @return Number of bytes read, which is `size` always. + * @return Number of bytes read, which is always `size`. */ std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0) { diff --git a/python/kvikio/kvikio/benchmarks/aws_s3_io.py b/python/kvikio/kvikio/benchmarks/aws_s3_io.py index d7ec858eee..4e88cd13d5 100644 --- a/python/kvikio/kvikio/benchmarks/aws_s3_io.py +++ b/python/kvikio/kvikio/benchmarks/aws_s3_io.py @@ -38,7 +38,7 @@ def start_s3_server(lifetime: int): from moto.server import ThreadedMotoServer # Silence the activity info from ThreadedMotoServer - sys.stderr = open("/dev/null", "w") + sys.stderr = open(os.devnull, "w") url = urlparse(os.environ["AWS_ENDPOINT_URL"]) server = ThreadedMotoServer(ip_address=url.hostname, port=url.port) server.start() diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 9779030ce5..dde7bf9f2b 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -67,7 +67,7 @@ def __init__(self, context: S3Context, bucket_name: str, object_name: str): Parameters ---------- context - The S3 context used for the connection to the remove server. + The S3 context used for the connection to the remote server. bucket_name Name of the bucket. 
object_name @@ -84,7 +84,7 @@ def from_url(cls, context: S3Context, url: str) -> RemoteFile: Parameters ---------- context - The S3 context used for the connection to the remove server. + The S3 context used for the connection to the remote server. url URL to the remote file. @@ -118,7 +118,7 @@ def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: Parameters ---------- - buf: buffer-like or array-like + buf : buffer-like or array-like Device or host buffer to read into. size Size in bytes to read. @@ -136,7 +136,7 @@ def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFutu Parameters ---------- - buf: buffer-like or array-like + buf : buffer-like or array-like Device or host buffer to read into. size Size in bytes to read. diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 7c479256ef..5c597ce253 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -92,7 +92,7 @@ def skipif_libcudf_s3_io_option_is_not_available() -> None: cudf.get_option("libcudf_s3_io") except KeyError: pytest.skip( - """cudf doesn't has the "libcudf_s3_io" option, """ + "cudf doesn't have the 'libcudf_s3_io' option, " "see " ) @@ -113,7 +113,7 @@ def test_aws_s3_io(run_cmd, api): pytest.skip( "cannot test remote IO, please build KvikIO with with AWS S3 support" ) - # Fail early if benchmark dependencies isn't available + # Fail early if benchmark dependencies aren't available import boto3 # noqa: F401 import moto # noqa: F401