From a01dc53fdff989ebdcd237839b37c503f6a26c75 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 17 Sep 2024 14:09:04 +0200 Subject: [PATCH 01/64] Impl. basic remote handle --- cpp/CMakeLists.txt | 17 ++ cpp/examples/CMakeLists.txt | 4 + cpp/examples/basic_io.cpp | 4 + cpp/include/kvikio/remote_handle.hpp | 211 ++++++++++++++++++++ python/kvikio/kvikio/__init__.py | 9 +- python/kvikio/kvikio/_lib/CMakeLists.txt | 9 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 39 ++++ python/kvikio/kvikio/remote_file.py | 100 ++++++++++ python/kvikio/tests/test_http_io.py | 50 +++++ 9 files changed, 441 insertions(+), 2 deletions(-) create mode 100644 cpp/include/kvikio/remote_handle.hpp create mode 100644 python/kvikio/kvikio/_lib/remote_handle.pyx create mode 100644 python/kvikio/kvikio/remote_file.py create mode 100644 python/kvikio/tests/test_http_io.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6a4f89a0d7..82abe38309 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,6 +37,7 @@ rapids_cmake_build_type(Release) # build options option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) +option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote io support" ON) rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -49,6 +50,12 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) +rapids_find_package( + CURL REQUIRED + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports +) + rapids_find_package( CUDAToolkit BUILD_EXPORT_SET kvikio-exports @@ -137,6 +144,10 @@ target_link_libraries( kvikio INTERFACE Threads::Threads BS::thread_pool ${CMAKE_DL_LIBS} $ ) +if(TARGET CURL::libcurl) + target_link_libraries(kvikio INTERFACE $) + target_compile_definitions(kvikio INTERFACE $) +endif() target_compile_features(kvikio INTERFACE cxx_std_17) # optionally build examples @@ -223,6 +234,12 @@ if(NOT already_set_kvikio) target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUFILE_STREAM_API_FOUND) endif() endif() + + if(TARGET CURL::libcurl) + target_link_libraries(kvikio::kvikio INTERFACE $) + target_compile_definitions(kvikio::kvikio INTERFACE $) + endif() + endif() ]=] ) diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt index c12ddb2e52..284590e943 100644 --- a/cpp/examples/CMakeLists.txt +++ b/cpp/examples/CMakeLists.txt @@ -14,6 +14,8 @@ set(TEST_INSTALL_PATH bin/tests/libkvikio) +# Example: basic_io + if(CUDAToolkit_FOUND) add_executable(BASIC_IO_TEST basic_io.cpp) set_target_properties(BASIC_IO_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") @@ -35,6 +37,8 @@ else() message(STATUS "Cannot build the basic_io example when CUDA is not found") endif() +# Example: basic_no_cuda + add_executable(BASIC_NO_CUDA_TEST basic_no_cuda.cpp) set_target_properties(BASIC_NO_CUDA_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") target_include_directories(BASIC_NO_CUDA_TEST PRIVATE ../include) diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp index 3a4ab892ad..5f0fc74a07 100644 --- a/cpp/examples/basic_io.cpp +++ b/cpp/examples/basic_io.cpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace std; @@ -291,4 +292,7 @@ int main() check(res.check_bytes_done() == SIZE); cout << "File async read: " << res.check_bytes_done() << endl; } + { + kvikio::RemoteHandle("hej", 42); + } } diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp new file mode 100644 index 0000000000..3e963dd45a --- /dev/null +++ b/cpp/include/kvikio/remote_handle.hpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef KVIKIO_LIBCURL_FOUND +#error "cannot include , configuration did not find libcurl" +#endif + +#include +#include +#include +#include + +#include + +#include +#include + +namespace kvikio { +namespace detail { + +/** + * @brief Singleton class to initialize and cleanup the global state of libcurl + * + * https://curl.se/libcurl/c/libcurl.html + * In a C++ module, it is common to deal with the global constant situation by defining a special + * class that represents the global constant environment of the module. A program always has exactly + * one object of the class, in static storage. That way, the program automatically calls the + * constructor of the object as the program starts up and the destructor as it terminates. As the + * author of this libcurl-using module, you can make the constructor call curl_global_init and the + * destructor call curl_global_cleanup and satisfy libcurl's requirements without your user having + * to think about it. (Caveat: If you are initializing libcurl from a Windows DLL you should not + * initialize it from DllMain or a static initializer because Windows holds the loader lock during + * that time and it could cause a deadlock.) + */ +class LibCurl { + private: + std::stack _free_curl_handles{}; + + LibCurl() + { + CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); + if (err != CURLE_OK) { + throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); + } + curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); + if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { + throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); + } + } + ~LibCurl() noexcept + { + // clean up all retained easy handles + while (!_free_curl_handles.empty()) { + curl_easy_cleanup(_free_curl_handles.top()); + _free_curl_handles.pop(); + } + curl_global_cleanup(); + } + + LibCurl(LibCurl const&) = delete; + LibCurl& operator=(LibCurl const&) = delete; + LibCurl(LibCurl&& o) = delete; + LibCurl& operator=(LibCurl&& o) = delete; + + public: + void put(CURL* handle) { _free_curl_handles.push(handle); } + + static LibCurl& instance() + { + static LibCurl _instance; + return _instance; + } + + CURL* get() + { + // Check if we have a handle available. + if (!_free_curl_handles.empty()) { + CURL* ret = _free_curl_handles.top(); + _free_curl_handles.pop(); + curl_easy_reset(ret); + return ret; + } + // If not, we create a new handle. + CURL* ret = curl_easy_init(); + if (ret == nullptr) { throw std::runtime_error("libcurl: call to curl_easy_init() failed"); } + return ret; + } +}; + +/** + * @brief A wrapper of a curl easy handle pointer `CURL*`. + */ +class CurlHandle { + private: + char _errbuf[CURL_ERROR_SIZE]; + CURL* _handle; + std::string _source_file; + std::string _source_line; + + public: + CurlHandle(CURL* handle, std::string source_file, std::string source_line) + : _handle{handle}, _source_file(std::move(source_file)), _source_line(std::move(source_line)) + { + } + ~CurlHandle() noexcept { detail::LibCurl::instance().put(_handle); } + + CurlHandle(CurlHandle const&) = delete; + CurlHandle& operator=(CurlHandle const&) = delete; + CurlHandle(CurlHandle&& o) = delete; + CurlHandle& operator=(CurlHandle&& o) = delete; + + CURL* handle() noexcept { return _handle; } + + template + void setopt(OPT option, VAL value) + { + CURLcode err = curl_easy_setopt(handle(), option, value); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_setopt() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } + void perform() + { + setopt(CURLOPT_ERRORBUFFER, _errbuf); + CURLcode err = curl_easy_perform(handle()); + if (err != CURLE_OK) { + std::string msg(_errbuf); + std::stringstream ss; + ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; + if (msg.empty()) { + ss << "(" << curl_easy_strerror(err) << ")"; + } else { + ss << "(" << msg << ")"; + } + throw std::runtime_error(ss.str()); + } + } +}; + +#define create_curl_handle() \ + kvikio::detail::CurlHandle( \ + kvikio::detail::LibCurl::instance().get(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) + +inline std::size_t get_file_size(std::string url) +{ + auto curl = create_curl_handle(); + + curl.setopt(CURLOPT_URL, url.c_str()); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FAILONERROR, 1L); + curl.perform(); + + curl_off_t cl; + curl_easy_getinfo(curl.handle(), CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + + std::cout << "get_file_size(" << url << "): " << sizeof(curl_off_t) << std::endl; + + return cl; +} +} // namespace detail + +/** + * @brief Handle of remote file. + */ +class RemoteHandle { + private: + std::string _url; + std::size_t _nbytes; + + public: + RemoteHandle(std::string url, std::size_t nbytes) : _url(std::move(url)), _nbytes{nbytes} + { + auto curl = create_curl_handle(); + std::cout << "RemoteHandle() - nbytes: " << _nbytes << std::endl; + } + + RemoteHandle(std::string const& url) : RemoteHandle(url, detail::get_file_size(url)) {} + + RemoteHandle(RemoteHandle const&) = delete; + RemoteHandle& operator=(RemoteHandle const&) = delete; + RemoteHandle(RemoteHandle&& o) = delete; + RemoteHandle& operator=(RemoteHandle&& o) = delete; + + /** + * @brief Get the file size. + * + * Note, this is very fast, no communication needed. + * + * @return The number of bytes. + */ + [[nodiscard]] std::size_t nbytes() const noexcept { return _nbytes; } +}; + +} // namespace kvikio diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 883ac9e784..749d87ec1f 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -4,9 +4,16 @@ from kvikio._lib import driver_properties # type: ignore from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile +from kvikio.remote_file import RemoteFile, is_remote_file_available # TODO: Wrap nicely, maybe as a dataclass? DriverProperties = driver_properties.DriverProperties -__all__ = ["__git_commit__", "__version__", "CuFile"] +__all__ = [ + "__git_commit__", + "__version__", + "CuFile", + "RemoteFile", + "is_remote_file_available", +] diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index c77d8e3df1..775c0b76de 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -17,8 +17,15 @@ set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_ha libnvcomp.pyx libnvcomp_ll.pyx ) +if(TARGET CURL::libcurl) + message(STATUS "Building remote_handle.pyx (libcurl found)") + list(APPEND cython_modules remote_handle.pyx) +else() + message(WARNING "Skipping remote_handle.pyx (libcurl not found or disabled)") +endif() + rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp $ ) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx new file mode 100644 index 0000000000..8990eb8cf7 --- /dev/null +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# distutils: language = c++ +# cython: language_level=3 + +from typing import Optional + +from cython.operator cimport dereference as deref +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string + +# from kvikio._lib.arr cimport parse_buffer_argument +# from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + + +cdef extern from "" nogil: + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": + cpp_RemoteHandle(string url, size_t nbytes) except + + cpp_RemoteHandle(string url) except + + int nbytes() except + + + +cdef class RemoteFile: + cdef unique_ptr[cpp_RemoteHandle] _handle + + @classmethod + def from_url(cls, url: str, nbytes: Optional[int]): + cdef RemoteFile ret = RemoteFile() + cdef string u = str.encode(str(url)) + if nbytes is None: + ret._handle = make_unique[cpp_RemoteHandle](u) + return ret + cdef size_t n = nbytes + ret._handle = make_unique[cpp_RemoteHandle](u, n) + return ret + + def nbytes(self) -> int: + return deref(self._handle).nbytes() diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py new file mode 100644 index 0000000000..4d895350bb --- /dev/null +++ b/python/kvikio/kvikio/remote_file.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +from __future__ import annotations + +import functools +from typing import Optional + +from kvikio.cufile import IOFuture + + +@functools.cache +def is_remote_file_available() -> bool: + """Check if the remote module is available""" + try: + import kvikio._lib.remote_handle # noqa: F401 + except ImportError: + return False + else: + return True + + +@functools.cache +def _get_remote_module(): + """Get the remote module or raise an error""" + if not is_remote_file_available(): + raise RuntimeError( + "RemoteFile not available, please build KvikIO with AWS S3 support" + ) + import kvikio._lib.remote_handle + + return kvikio._lib.remote_handle + + +class RemoteFile: + """File handle of a remote file (currently, only AWS S3 is supported).""" + + def __init__(self, url: str, nbytes: Optional[int] = None): + """Open a remote file given a bucket and object name. + + Parameters + ---------- + url + URL to the remote file. + """ + self._handle = _get_remote_module().RemoteFile.from_url(url, nbytes) + + def __enter__(self) -> RemoteFile: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + pass + + def nbytes(self) -> int: + """Get the file size. + + Note, this is very fast, no communication needed. + + Returns + ------- + The number of bytes. + """ + return self._handle.nbytes() + + # def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: + # """Read from remote source into buffer (host or device memory) in parallel. + + # Parameters + # ---------- + # buf : buffer-like or array-like + # Device or host buffer to read into. + # size + # Size in bytes to read. + # file_offset + # Offset in the file to read from. + + # Returns + # ------- + # The size of bytes that were successfully read. + # """ + # return self.pread(buf, size, file_offset).get() + + # def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: + # """Read from remote source into buffer (host or device memory) in parallel. + + # Parameters + # ---------- + # buf : buffer-like or array-like + # Device or host buffer to read into. + # size + # Size in bytes to read. + # file_offset + # Offset in the file to read from. + + # Returns + # ------- + # Future that on completion returns the size of bytes that were successfully + # read. + # """ + # return IOFuture(self._handle.pread(buf, size, file_offset)) diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py new file mode 100644 index 0000000000..9bc8d01dd2 --- /dev/null +++ b/python/kvikio/tests/test_http_io.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + + +import functools +import multiprocessing as mp +import threading +import time +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + +import numpy as np +import pytest + +import kvikio +import kvikio.defaults + +pytestmark = pytest.mark.skipif( + not kvikio.is_remote_file_available(), + reason="cannot test remote IO, please build KvikIO with libcurl", +) + + +def start_http_server(queue: mp.Queue, tmpdir: str): + httpd = ThreadingHTTPServer( + ("127.0.0.1", 0), functools.partial(SimpleHTTPRequestHandler, directory=tmpdir) + ) + thread = threading.Thread(target=httpd.serve_forever) + thread.start() + queue.put(httpd.server_address) + time.sleep(60) + print("ThreadingHTTPServer shutting down because of timeout (60sec)") + + +@pytest.fixture # (scope="session") +def http_server(tmpdir): + """Fixture to set up http server in separate process""" + print(str(tmpdir)) + queue = mp.Queue() + p = mp.Process(target=start_http_server, args=(queue, str(tmpdir))) + p.start() + ip, port = queue.get() + yield f"http://{ip}:{port}" + p.kill() + + +def test_file_size(http_server, tmpdir): + a = np.arange(100) + a.tofile(tmpdir / "a") + with kvikio.RemoteFile(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes From a6467d5fb43e3a80287affa03c19a5288ea05072 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 14:59:13 +0200 Subject: [PATCH 02/64] RemoteHandle::read() --- cpp/include/kvikio/remote_handle.hpp | 143 +++++++++++++++++++- python/kvikio/kvikio/_lib/remote_handle.pyx | 35 ++++- python/kvikio/kvikio/remote_file.py | 72 +++++----- python/kvikio/tests/test_http_io.py | 35 ++++- 4 files changed, 238 insertions(+), 47 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 3e963dd45a..45e44c7303 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -19,6 +19,7 @@ #error "cannot include , configuration did not find libcurl" #endif +#include #include #include #include @@ -28,6 +29,9 @@ #include #include +#include +#include +#include namespace kvikio { namespace detail { @@ -152,6 +156,17 @@ class CurlHandle { throw std::runtime_error(ss.str()); } } + template + void getinfo(INFO info, VALUE value) + { + CURLcode err = curl_easy_getinfo(handle(), info, value); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_getinfo() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } }; #define create_curl_handle() \ @@ -168,12 +183,70 @@ inline std::size_t get_file_size(std::string url) curl.perform(); curl_off_t cl; - curl_easy_getinfo(curl.handle(), CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + return cl; +} - std::cout << "get_file_size(" << url << "): " << sizeof(curl_off_t) << std::endl; +struct CallbackContext { + char* buf; + std::size_t size; + std::size_t offset; +}; - return cl; +inline std::size_t callback_host_memory(char* data, + std::size_t size, + std::size_t nmemb, + void* context) +{ + auto ctx = reinterpret_cast(context); + std::size_t nbytes = size * nmemb; + if (ctx->size < ctx->offset + nbytes) { + std::cout << "callback_host_memory() - FAILED: " << ((void*)data) + << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset + << ", nbytes: " << nbytes << std::endl; + return CURL_WRITEFUNC_ERROR; + } + + // std::cout << "callback_host_memory() - data: " << ((void*)data) + // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset + // << ", nbytes: " << nbytes << std::endl; + + std::memcpy(ctx->buf + ctx->offset, data, nbytes); + ctx->offset += nbytes; + return nbytes; +} + +inline std::size_t callback_device_memory(char* data, + std::size_t size, + std::size_t nmemb, + void* context) +{ + auto ctx = reinterpret_cast(context); + std::size_t nbytes = size * nmemb; + if (ctx->size < ctx->offset + nbytes) { + std::cout << "callback_device_memory() - FAILED: " << ((void*)data) + << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset + << ", nbytes: " << nbytes << std::endl; + + return CURL_WRITEFUNC_ERROR; + } + + CUcontext cuda_ctx = get_context_from_pointer(ctx->buf); + PushAndPopContext c(cuda_ctx); + CUstream stream = detail::StreamsByThread::get(); + + // std::cout << "callback_device_memory() - data: " << ((void*)data) + // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset + // << ", nbytes: " << nbytes << std::endl; + + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( + convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + + ctx->offset += nbytes; + return nbytes; } + } // namespace detail /** @@ -187,8 +260,7 @@ class RemoteHandle { public: RemoteHandle(std::string url, std::size_t nbytes) : _url(std::move(url)), _nbytes{nbytes} { - auto curl = create_curl_handle(); - std::cout << "RemoteHandle() - nbytes: " << _nbytes << std::endl; + std::cout << "RemoteHandle(" << _url << ") - nbytes: " << _nbytes << std::endl; } RemoteHandle(std::string const& url) : RemoteHandle(url, detail::get_file_size(url)) {} @@ -206,6 +278,67 @@ class RemoteHandle { * @return The number of bytes. */ [[nodiscard]] std::size_t nbytes() const noexcept { return _nbytes; } + + /** + * @brief Read from remote source into buffer (host or device memory). + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @return Number of bytes read, which is always `size`. + */ + std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0) + { + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::read()", size); + + auto curl = create_curl_handle(); + + curl.setopt(CURLOPT_URL, _url.c_str()); + curl.setopt(CURLOPT_FAILONERROR, 1L); + + std::string const byte_range = + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); + curl.setopt(CURLOPT_RANGE, byte_range.c_str()); + + if (is_host_memory(buf)) { + curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_host_memory); + } else { + curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_device_memory); + } + detail::CallbackContext ctx{.buf = reinterpret_cast(buf), .size = size, .offset = 0}; + curl.setopt(CURLOPT_WRITEDATA, &ctx); + + // std::cout << "read() - buf: " << buf << ", byte_range: " << byte_range << std::endl; + curl.perform(); + return size; + } + + /** + * @brief Read from remote source into buffer (host or device memory) in parallel. + * + * This API is a parallel async version of `.read()` that partition the operation + * into tasks of size `task_size` for execution in the default thread pool. + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @param task_size Size of each task in bytes. + * @return Number of bytes read, which is `size` always. + */ + std::future pread(void* buf, + std::size_t size, + std::size_t file_offset = 0, + std::size_t task_size = defaults::task_size()) + { + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::pread()", size); + auto task = [this](void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset) -> std::size_t { + return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); + }; + return parallel_io(task, buf, size, file_offset, task_size, 0); + } }; } // namespace kvikio diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 8990eb8cf7..c348949647 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -7,11 +7,13 @@ from typing import Optional from cython.operator cimport dereference as deref +from libc.stdint cimport uintptr_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string +from libcpp.utility cimport pair -# from kvikio._lib.arr cimport parse_buffer_argument -# from kvikio._lib.future cimport IOFuture, _wrap_io_future, future +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future cdef extern from "" nogil: @@ -19,7 +21,16 @@ cdef extern from "" nogil: cpp_RemoteHandle(string url, size_t nbytes) except + cpp_RemoteHandle(string url) except + int nbytes() except + - + size_t read( + void* buf, + size_t size, + size_t file_offset + ) except + + future[size_t] pread( + void* buf, + size_t size, + size_t file_offset + ) except + cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @@ -37,3 +48,21 @@ cdef class RemoteFile: def nbytes(self) -> int: return deref(self._handle).nbytes() + + def read(self, buf, size: Optional[int], file_offset: int) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return deref(self._handle).read( + info.first, + info.second, + file_offset, + ) + + def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return _wrap_io_future( + deref(self._handle).pread( + info.first, + info.second, + file_offset, + ) + ) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 4d895350bb..2fa0cb2c92 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -62,39 +62,39 @@ def nbytes(self) -> int: """ return self._handle.nbytes() - # def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: - # """Read from remote source into buffer (host or device memory) in parallel. - - # Parameters - # ---------- - # buf : buffer-like or array-like - # Device or host buffer to read into. - # size - # Size in bytes to read. - # file_offset - # Offset in the file to read from. - - # Returns - # ------- - # The size of bytes that were successfully read. - # """ - # return self.pread(buf, size, file_offset).get() - - # def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: - # """Read from remote source into buffer (host or device memory) in parallel. - - # Parameters - # ---------- - # buf : buffer-like or array-like - # Device or host buffer to read into. - # size - # Size in bytes to read. - # file_offset - # Offset in the file to read from. - - # Returns - # ------- - # Future that on completion returns the size of bytes that were successfully - # read. - # """ - # return IOFuture(self._handle.pread(buf, size, file_offset)) + def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf : buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + The size of bytes that were successfully read. + """ + return self.pread(buf, size, file_offset).get() + + def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf : buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + Future that on completion returns the size of bytes that were successfully + read. + """ + return IOFuture(self._handle.pread(buf, size, file_offset)) diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 9bc8d01dd2..69b1b6759a 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -6,10 +6,11 @@ import multiprocessing as mp import threading import time -from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer +from http.server import ThreadingHTTPServer import numpy as np import pytest +from RangeHTTPServer import RangeRequestHandler import kvikio import kvikio.defaults @@ -22,7 +23,7 @@ def start_http_server(queue: mp.Queue, tmpdir: str): httpd = ThreadingHTTPServer( - ("127.0.0.1", 0), functools.partial(SimpleHTTPRequestHandler, directory=tmpdir) + ("127.0.0.1", 0), functools.partial(RangeRequestHandler, directory=tmpdir) ) thread = threading.Thread(target=httpd.serve_forever) thread.start() @@ -34,7 +35,6 @@ def start_http_server(queue: mp.Queue, tmpdir: str): @pytest.fixture # (scope="session") def http_server(tmpdir): """Fixture to set up http server in separate process""" - print(str(tmpdir)) queue = mp.Queue() p = mp.Process(target=start_http_server, args=(queue, str(tmpdir))) p.start() @@ -48,3 +48,32 @@ def test_file_size(http_server, tmpdir): a.tofile(tmpdir / "a") with kvikio.RemoteFile(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes + + +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("nthreads", [1, 3]) +@pytest.mark.parametrize("tasksize", [99, 999]) +def test_read(http_server, tmpdir, xp, size, nthreads, tasksize): + a = xp.arange(size) + a.tofile(tmpdir / "a") + + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.defaults.set_task_size(tasksize): + with kvikio.RemoteFile(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + b = xp.empty_like(a) + assert f.read(buf=b) == a.nbytes + xp.testing.assert_array_equal(a, b) + + +@pytest.mark.parametrize("nthreads", [1, 10]) +def test_large_read(http_server, tmpdir, xp, nthreads): + a = xp.arange(16_000_000) + a.tofile(tmpdir / "a") + + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.RemoteFile(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + b = xp.empty_like(a) + assert f.read(buf=b) == a.nbytes + xp.testing.assert_array_equal(a, b) From 127b7825e33248e217f5ca6a2b34b940d3610415 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:11:18 +0200 Subject: [PATCH 03/64] ci: rangehttpserver --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 ++ conda/environments/all_cuda-118_arch-x86_64.yaml | 2 ++ conda/environments/all_cuda-125_arch-aarch64.yaml | 2 ++ conda/environments/all_cuda-125_arch-x86_64.yaml | 2 ++ dependencies.yaml | 2 ++ python/kvikio/pyproject.toml | 1 + python/kvikio/tests/test_http_io.py | 9 +++++---- 7 files changed, 16 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 65ca39fa34..bace9147c0 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -35,4 +35,6 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-aarch64=2.17 - zarr +- pip: + - rangehttpserver name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a020690e64..414bf2b6ce 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -37,4 +37,6 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-64=2.17 - zarr +- pip: + - rangehttpserver name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 31145241d7..bf5da07997 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -35,4 +35,6 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-aarch64=2.17 - zarr +- pip: + - rangehttpserver name: all_cuda-125_arch-aarch64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4d7d0be7c6..717db4955a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -35,4 +35,6 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-64=2.17 - zarr +- pip: + - rangehttpserver name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 123112ac1a..a5f9681291 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -319,6 +319,8 @@ dependencies: - &dask dask>=2022.05.2 - pytest - pytest-cov + - pip: + - rangehttpserver specific: - output_types: [conda, requirements, pyproject] matrices: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 046e157a11..aa3e1142a0 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -42,6 +42,7 @@ test = [ "dask>=2022.05.2", "pytest", "pytest-cov", + {pip = ["rangehttpserver"]}, ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 69b1b6759a..f54679cb1f 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -6,7 +6,7 @@ import multiprocessing as mp import threading import time -from http.server import ThreadingHTTPServer +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer import numpy as np import pytest @@ -21,9 +21,10 @@ ) -def start_http_server(queue: mp.Queue, tmpdir: str): +def start_http_server(queue: mp.Queue, tmpdir: str, range_support: bool = True): + handler = RangeRequestHandler if range_support else SimpleHTTPRequestHandler httpd = ThreadingHTTPServer( - ("127.0.0.1", 0), functools.partial(RangeRequestHandler, directory=tmpdir) + ("127.0.0.1", 0), functools.partial(handler, directory=tmpdir) ) thread = threading.Thread(target=httpd.serve_forever) thread.start() @@ -32,7 +33,7 @@ def start_http_server(queue: mp.Queue, tmpdir: str): print("ThreadingHTTPServer shutting down because of timeout (60sec)") -@pytest.fixture # (scope="session") +@pytest.fixture def http_server(tmpdir): """Fixture to set up http server in separate process""" queue = mp.Queue() From 089397ff8876252a53c5e956fa34004f95c2c5e9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:21:00 +0200 Subject: [PATCH 04/64] libcurl>=7.75.0 # Need --- conda/environments/all_cuda-118_arch-aarch64.yaml | 1 + conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-125_arch-aarch64.yaml | 1 + conda/environments/all_cuda-125_arch-x86_64.yaml | 1 + conda/recipes/libkvikio/meta.yaml | 1 + dependencies.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index bace9147c0..ca1fd191e1 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -17,6 +17,7 @@ dependencies: - dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-aarch64=11.* +- libcurl>=7.75.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 414bf2b6ce..292d7f749f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,6 +19,7 @@ dependencies: - gcc_linux-64=11.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 +- libcurl>=7.75.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index bf5da07997..2600db3c0b 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -18,6 +18,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-aarch64=11.* - libcufile-dev +- libcurl>=7.75.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 717db4955a..8d430b388a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -18,6 +18,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-64=11.* - libcufile-dev +- libcurl>=7.75.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 186c373f56..f3727fbdd0 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -52,6 +52,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} + - libcurl>=7.75.0 outputs: - name: libkvikio diff --git a/dependencies.yaml b/dependencies.yaml index a5f9681291..2148b45e2a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,6 +106,7 @@ dependencies: packages: - c-compiler - cxx-compiler + - libcurl>=7.75.0 # Need specific: - output_types: conda matrices: From 5f1fca076c68631ba6a254f7d50e1102f1e3d46d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:23:41 +0200 Subject: [PATCH 05/64] cmake --- cpp/CMakeLists.txt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82abe38309..6b15512deb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -50,11 +50,13 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) -rapids_find_package( - CURL REQUIRED - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports -) +if(KvikIO_REMOTE_SUPPORT) + rapids_find_package( + CURL REQUIRED + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + ) +endif() rapids_find_package( CUDAToolkit From eaacacfdfdaa88e26aaf9353482f3793737e1ed9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:32:39 +0200 Subject: [PATCH 06/64] fix ci rangehttpserver --- dependencies.yaml | 7 ++++++- python/kvikio/pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 2148b45e2a..4953f22a8f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -320,8 +320,13 @@ dependencies: - &dask dask>=2022.05.2 - pytest - pytest-cov + - output_types: [requirements, pyproject] + packages: + - rangehttpserver + - output_types: conda + packages: - pip: - - rangehttpserver + - rangehttpserver specific: - output_types: [conda, requirements, pyproject] matrices: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index aa3e1142a0..9bd86518ff 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -42,7 +42,7 @@ test = [ "dask>=2022.05.2", "pytest", "pytest-cov", - {pip = ["rangehttpserver"]}, + "rangehttpserver", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] From 59595a895bd6f76f7aa68851b32adf15d81d93f0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:34:39 +0200 Subject: [PATCH 07/64] cleanup --- cpp/examples/basic_io.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp index 5f0fc74a07..f5942dda9b 100644 --- a/cpp/examples/basic_io.cpp +++ b/cpp/examples/basic_io.cpp @@ -292,7 +292,4 @@ int main() check(res.check_bytes_done() == SIZE); cout << "File async read: " << res.check_bytes_done() << endl; } - { - kvikio::RemoteHandle("hej", 42); - } } From f8032206477cd92f2f91e8548500dac925d6e728 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 15:50:50 +0200 Subject: [PATCH 08/64] get_file_size(): check -1 --- cpp/include/kvikio/remote_handle.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 45e44c7303..aef8066a1d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -184,6 +184,10 @@ inline std::size_t get_file_size(std::string url) curl_off_t cl; curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + if (cl < 0) { + throw std::runtime_error("cannot get size of " + url + + ", content-length not provided by the server"); + } return cl; } From dd06044650a8f90da9c60c41aa2fde48835333f9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 18 Sep 2024 16:07:29 +0200 Subject: [PATCH 09/64] Read(): check file size --- cpp/include/kvikio/remote_handle.hpp | 7 +++++++ python/kvikio/tests/test_http_io.py | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index aef8066a1d..4db41436f5 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -295,6 +295,13 @@ class RemoteHandle { { KVIKIO_NVTX_FUNC_RANGE("RemoteHandle::read()", size); + if (file_offset + size > _nbytes) { + std::stringstream ss; + ss << "cannot read " << file_offset << "+" << size << " bytes into a " << _nbytes + << " bytes file (" << _url << ")"; + throw std::invalid_argument(ss.str()); + } + auto curl = create_curl_handle(); curl.setopt(CURLOPT_URL, _url.c_str()); diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index f54679cb1f..ee33908fd2 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -63,7 +63,7 @@ def test_read(http_server, tmpdir, xp, size, nthreads, tasksize): with kvikio.RemoteFile(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) - assert f.read(buf=b) == a.nbytes + assert f.read(b) == a.nbytes xp.testing.assert_array_equal(a, b) @@ -76,5 +76,21 @@ def test_large_read(http_server, tmpdir, xp, nthreads): with kvikio.RemoteFile(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) - assert f.read(buf=b) == a.nbytes + assert f.read(b) == a.nbytes xp.testing.assert_array_equal(a, b) + + +def test_error_too_small_file(http_server, tmpdir, xp): + a = xp.arange(10, dtype="uint8") + b = xp.empty(100, dtype="uint8") + a.tofile(tmpdir / "a") + with kvikio.RemoteFile(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + with pytest.raises( + ValueError, match=r"cannot read 0\+100 bytes into a 10 bytes file" + ): + f.read(b) + with pytest.raises( + ValueError, match=r"cannot read 100\+5 bytes into a 10 bytes file" + ): + f.read(b, size=5, file_offset=100) From 06dcb35019b291825f47b6e347ce825b463fface Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 13:02:22 +0200 Subject: [PATCH 10/64] LibCurl get/set: use lock --- cpp/include/kvikio/remote_handle.hpp | 41 ++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 4db41436f5..25669f8fb2 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -52,6 +52,7 @@ namespace detail { */ class LibCurl { private: + std::mutex _mutex{}; std::stack _free_curl_handles{}; LibCurl() @@ -81,8 +82,6 @@ class LibCurl { LibCurl& operator=(LibCurl&& o) = delete; public: - void put(CURL* handle) { _free_curl_handles.push(handle); } - static LibCurl& instance() { static LibCurl _instance; @@ -92,17 +91,28 @@ class LibCurl { CURL* get() { // Check if we have a handle available. - if (!_free_curl_handles.empty()) { - CURL* ret = _free_curl_handles.top(); - _free_curl_handles.pop(); - curl_easy_reset(ret); - return ret; + CURL* ret = nullptr; + { + std::lock_guard const lock(_mutex); + if (!_free_curl_handles.empty()) { + ret = _free_curl_handles.top(); + _free_curl_handles.pop(); + } } // If not, we create a new handle. - CURL* ret = curl_easy_init(); - if (ret == nullptr) { throw std::runtime_error("libcurl: call to curl_easy_init() failed"); } + if (ret == nullptr) { + ret = curl_easy_init(); + if (ret == nullptr) { throw std::runtime_error("libcurl: call to curl_easy_init() failed"); } + } + curl_easy_reset(ret); return ret; } + + void put(CURL* handle) + { + std::lock_guard const lock(_mutex); + _free_curl_handles.push(handle); + } }; /** @@ -142,7 +152,17 @@ class CurlHandle { } void perform() { + // Need CURLOPT_NOSIGNAL to support threading, see + // + setopt(CURLOPT_NOSIGNAL, 1L); + + // We always set CURLOPT_ERRORBUFFER to get better error messages. setopt(CURLOPT_ERRORBUFFER, _errbuf); + + // Make curl_easy_perform() fail when receiving HTTP code errors. + setopt(CURLOPT_FAILONERROR, 1L); + + // Perform and check for errors. CURLcode err = curl_easy_perform(handle()); if (err != CURLE_OK) { std::string msg(_errbuf); @@ -156,6 +176,7 @@ class CurlHandle { throw std::runtime_error(ss.str()); } } + template void getinfo(INFO info, VALUE value) { @@ -179,7 +200,6 @@ inline std::size_t get_file_size(std::string url) curl.setopt(CURLOPT_URL, url.c_str()); curl.setopt(CURLOPT_NOBODY, 1L); - curl.setopt(CURLOPT_FAILONERROR, 1L); curl.perform(); curl_off_t cl; @@ -305,7 +325,6 @@ class RemoteHandle { auto curl = create_curl_handle(); curl.setopt(CURLOPT_URL, _url.c_str()); - curl.setopt(CURLOPT_FAILONERROR, 1L); std::string const byte_range = std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); From 7501600d109675b4d229433a5942f8b979608c4b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 13:27:11 +0200 Subject: [PATCH 11/64] test_no_range_support --- cpp/include/kvikio/remote_handle.hpp | 16 +++------------- python/kvikio/tests/test_http_io.py | 27 ++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 25669f8fb2..4de79e2145 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -173,6 +173,7 @@ class CurlHandle { } else { ss << "(" << msg << ")"; } + if (err == CURLE_WRITE_ERROR) { ss << "[maybe the server doesn't support file ranges?]"; } throw std::runtime_error(ss.str()); } } @@ -224,12 +225,7 @@ inline std::size_t callback_host_memory(char* data, { auto ctx = reinterpret_cast(context); std::size_t nbytes = size * nmemb; - if (ctx->size < ctx->offset + nbytes) { - std::cout << "callback_host_memory() - FAILED: " << ((void*)data) - << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset - << ", nbytes: " << nbytes << std::endl; - return CURL_WRITEFUNC_ERROR; - } + if (ctx->size < ctx->offset + nbytes) { return CURL_WRITEFUNC_ERROR; } // std::cout << "callback_host_memory() - data: " << ((void*)data) // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset @@ -247,13 +243,7 @@ inline std::size_t callback_device_memory(char* data, { auto ctx = reinterpret_cast(context); std::size_t nbytes = size * nmemb; - if (ctx->size < ctx->offset + nbytes) { - std::cout << "callback_device_memory() - FAILED: " << ((void*)data) - << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset - << ", nbytes: " << nbytes << std::endl; - - return CURL_WRITEFUNC_ERROR; - } + if (ctx->size < ctx->offset + nbytes) { return CURL_WRITEFUNC_ERROR; } CUcontext cuda_ctx = get_context_from_pointer(ctx->buf); PushAndPopContext c(cuda_ctx); diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index ee33908fd2..3edc9377d7 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -21,7 +21,7 @@ ) -def start_http_server(queue: mp.Queue, tmpdir: str, range_support: bool = True): +def start_http_server(queue: mp.Queue, tmpdir: str, range_support: bool): handler = RangeRequestHandler if range_support else SimpleHTTPRequestHandler httpd = ThreadingHTTPServer( ("127.0.0.1", 0), functools.partial(handler, directory=tmpdir) @@ -34,10 +34,14 @@ def start_http_server(queue: mp.Queue, tmpdir: str, range_support: bool = True): @pytest.fixture -def http_server(tmpdir): +def http_server(request, tmpdir): """Fixture to set up http server in separate process""" + range_support = True + if hasattr(request, "param"): + range_support = request.param.get("range_support", True) + queue = mp.Queue() - p = mp.Process(target=start_http_server, args=(queue, str(tmpdir))) + p = mp.Process(target=start_http_server, args=(queue, str(tmpdir), range_support)) p.start() ip, port = queue.get() yield f"http://{ip}:{port}" @@ -94,3 +98,20 @@ def test_error_too_small_file(http_server, tmpdir, xp): ValueError, match=r"cannot read 100\+5 bytes into a 10 bytes file" ): f.read(b, size=5, file_offset=100) + + +@pytest.mark.parametrize("http_server", [{"range_support": False}], indirect=True) +def test_no_range_support(http_server, tmpdir, xp): + a = xp.arange(100, dtype="uint8") + a.tofile(tmpdir / "a") + b = xp.empty_like(a) + with kvikio.RemoteFile(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + with pytest.raises( + RuntimeError, match="maybe the server doesn't support file ranges?" + ): + f.read(b, size=10, file_offset=0) + with pytest.raises( + RuntimeError, match="maybe the server doesn't support file ranges?" + ): + f.read(b, size=10, file_offset=10) From 7e6315ea3d65afa1f18acd74d5a9aabfa29a5dbd Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 13:31:14 +0200 Subject: [PATCH 12/64] bump libcurl>=7.87.0 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/libkvikio/meta.yaml | 2 +- dependencies.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index ca1fd191e1..cee9bc495a 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-aarch64=11.* -- libcurl>=7.75.0 +- libcurl>=7.87.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 292d7f749f..6aa0a40289 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - gcc_linux-64=11.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 -- libcurl>=7.75.0 +- libcurl>=7.87.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 2600db3c0b..321431dbed 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -18,7 +18,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-aarch64=11.* - libcufile-dev -- libcurl>=7.75.0 +- libcurl>=7.87.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 8d430b388a..5123bc012d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -18,7 +18,7 @@ dependencies: - doxygen=1.9.1 - gcc_linux-64=11.* - libcufile-dev -- libcurl>=7.75.0 +- libcurl>=7.87.0 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index f3727fbdd0..85fa9068e7 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -52,7 +52,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} - - libcurl>=7.75.0 + - libcurl>=7.87.0 outputs: - name: libkvikio diff --git a/dependencies.yaml b/dependencies.yaml index 4953f22a8f..5ab098e2f5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,7 +106,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcurl>=7.75.0 # Need + - libcurl>=7.87.0 # Need specific: - output_types: conda matrices: From 4f9ab63df49699ef241ff11f12d43b22f9526bb8 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 13:36:42 +0200 Subject: [PATCH 13/64] ci: more libcurl>=7.87.0 --- conda/recipes/libkvikio/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 85fa9068e7..233bbd4727 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -84,6 +84,7 @@ outputs: {% else %} - libcufile-dev # [linux] {% endif %} + - libcurl>=7.87.0 test: commands: - test -f $PREFIX/include/kvikio/file_handle.hpp From a20774c4400560ee3628f060a4716f88002a7874 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 13:54:17 +0200 Subject: [PATCH 14/64] cmake: CURL 7.87.0 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6b15512deb..8f464ba5d5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,7 +52,7 @@ rapids_find_package( if(KvikIO_REMOTE_SUPPORT) rapids_find_package( - CURL REQUIRED + CURL 7.87.0 REQUIRED BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports ) From 43421edf7ff45ccf7d2d7e3a9fd28afac663f209 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 14:08:23 +0200 Subject: [PATCH 15/64] ci_debug --- cpp/include/kvikio/remote_handle.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 4de79e2145..bde743ce42 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -174,6 +174,7 @@ class CurlHandle { ss << "(" << msg << ")"; } if (err == CURLE_WRITE_ERROR) { ss << "[maybe the server doesn't support file ranges?]"; } + std::cout << "perform() - error: " << ss.str() << std::endl; throw std::runtime_error(ss.str()); } } From 7df0e9f9490bca12869c6fc6c7a337aeebc92dca Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 14:22:45 +0200 Subject: [PATCH 16/64] CURLOPT_FOLLOWLOCATION --- cpp/include/kvikio/remote_handle.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index bde743ce42..e219a06f5c 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -202,6 +202,7 @@ inline std::size_t get_file_size(std::string url) curl.setopt(CURLOPT_URL, url.c_str()); curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); curl.perform(); curl_off_t cl; From 40b4be35caa3643df0ac989ac703e49e02bd6ddd Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 19 Sep 2024 14:50:34 +0200 Subject: [PATCH 17/64] overflow_error --- cpp/include/kvikio/remote_handle.hpp | 29 +++++++++++++++++++++------- python/kvikio/tests/test_http_io.py | 4 ++-- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index e219a06f5c..32362266aa 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -162,7 +162,7 @@ class CurlHandle { // Make curl_easy_perform() fail when receiving HTTP code errors. setopt(CURLOPT_FAILONERROR, 1L); - // Perform and check for errors. + // Perform the curl operation and check for errors. CURLcode err = curl_easy_perform(handle()); if (err != CURLE_OK) { std::string msg(_errbuf); @@ -173,8 +173,6 @@ class CurlHandle { } else { ss << "(" << msg << ")"; } - if (err == CURLE_WRITE_ERROR) { ss << "[maybe the server doesn't support file ranges?]"; } - std::cout << "perform() - error: " << ss.str() << std::endl; throw std::runtime_error(ss.str()); } } @@ -218,6 +216,7 @@ struct CallbackContext { char* buf; std::size_t size; std::size_t offset; + bool overflow_error; }; inline std::size_t callback_host_memory(char* data, @@ -227,7 +226,10 @@ inline std::size_t callback_host_memory(char* data, { auto ctx = reinterpret_cast(context); std::size_t nbytes = size * nmemb; - if (ctx->size < ctx->offset + nbytes) { return CURL_WRITEFUNC_ERROR; } + if (ctx->size < ctx->offset + nbytes) { + ctx->overflow_error = true; + return CURL_WRITEFUNC_ERROR; + } // std::cout << "callback_host_memory() - data: " << ((void*)data) // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset @@ -245,7 +247,10 @@ inline std::size_t callback_device_memory(char* data, { auto ctx = reinterpret_cast(context); std::size_t nbytes = size * nmemb; - if (ctx->size < ctx->offset + nbytes) { return CURL_WRITEFUNC_ERROR; } + if (ctx->size < ctx->offset + nbytes) { + ctx->overflow_error = true; + return CURL_WRITEFUNC_ERROR; + } CUcontext cuda_ctx = get_context_from_pointer(ctx->buf); PushAndPopContext c(cuda_ctx); @@ -327,11 +332,21 @@ class RemoteHandle { } else { curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_device_memory); } - detail::CallbackContext ctx{.buf = reinterpret_cast(buf), .size = size, .offset = 0}; + detail::CallbackContext ctx{ + .buf = reinterpret_cast(buf), .size = size, .offset = 0, .overflow_error = false}; curl.setopt(CURLOPT_WRITEDATA, &ctx); // std::cout << "read() - buf: " << buf << ", byte_range: " << byte_range << std::endl; - curl.perform(); + try { + curl.perform(); + } catch (std::runtime_error const& e) { + if (ctx.overflow_error) { + std::stringstream ss; + ss << "maybe the server doesn't support file ranges? [" << e.what() << "]"; + throw std::overflow_error(ss.str()); + } + throw; + } return size; } diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 3edc9377d7..9b510d042e 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -108,10 +108,10 @@ def test_no_range_support(http_server, tmpdir, xp): with kvikio.RemoteFile(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes with pytest.raises( - RuntimeError, match="maybe the server doesn't support file ranges?" + OverflowError, match="maybe the server doesn't support file ranges?" ): f.read(b, size=10, file_offset=0) with pytest.raises( - RuntimeError, match="maybe the server doesn't support file ranges?" + OverflowError, match="maybe the server doesn't support file ranges?" ): f.read(b, size=10, file_offset=10) From 943e30b623c63e4abe4a19962cb5b9476ed0319a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 20 Sep 2024 10:55:35 +0200 Subject: [PATCH 18/64] RemoteEndpoint --- cpp/examples/basic_io.cpp | 1 - cpp/include/kvikio/remote_handle.hpp | 77 ++++++++++++++------- python/kvikio/kvikio/_lib/remote_handle.pyx | 15 ++-- 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp index f5942dda9b..3a4ab892ad 100644 --- a/cpp/examples/basic_io.cpp +++ b/cpp/examples/basic_io.cpp @@ -25,7 +25,6 @@ #include #include #include -#include using namespace std; diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 32362266aa..8bde77fd5e 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -20,6 +20,7 @@ #endif #include +#include #include #include #include @@ -194,24 +195,6 @@ class CurlHandle { kvikio::detail::CurlHandle( \ kvikio::detail::LibCurl::instance().get(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) -inline std::size_t get_file_size(std::string url) -{ - auto curl = create_curl_handle(); - - curl.setopt(CURLOPT_URL, url.c_str()); - curl.setopt(CURLOPT_NOBODY, 1L); - curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); - curl.perform(); - - curl_off_t cl; - curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); - if (cl < 0) { - throw std::runtime_error("cannot get size of " + url + - ", content-length not provided by the server"); - } - return cl; -} - struct CallbackContext { char* buf; std::size_t size; @@ -270,21 +253,66 @@ inline std::size_t callback_device_memory(char* data, } // namespace detail +/** + * @brief + */ +class RemoteEndpoint { + public: + RemoteEndpoint() {} + virtual void setopt(detail::CurlHandle& curl) = 0; + virtual std::string str() = 0; +}; + +/** + * @brief + */ +class HttpEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + HttpEndpoint() = default; + HttpEndpoint(std::string url) : _url{std::move(url)} {} + void setopt(detail::CurlHandle& curl) override { curl.setopt(CURLOPT_URL, _url.c_str()); } + std::string str() override { return _url; } +}; + /** * @brief Handle of remote file. */ class RemoteHandle { private: - std::string _url; + std::unique_ptr _endpoint; std::size_t _nbytes; public: - RemoteHandle(std::string url, std::size_t nbytes) : _url(std::move(url)), _nbytes{nbytes} + RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) + : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { - std::cout << "RemoteHandle(" << _url << ") - nbytes: " << _nbytes << std::endl; + std::cout << "RemoteHandle1() - endpoint: " << _endpoint->str() << ", nbytes: " << _nbytes + << std::endl; } + RemoteHandle(std::unique_ptr endpoint) + { + auto curl = create_curl_handle(); + + endpoint->setopt(curl); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + curl_off_t cl; + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + if (cl < 0) { + throw std::runtime_error("cannot get size of " + endpoint->str() + + ", content-length not provided by the server"); + } - RemoteHandle(std::string const& url) : RemoteHandle(url, detail::get_file_size(url)) {} + _nbytes = cl; + _endpoint = std::move(endpoint); + + std::cout << "RemoteHandle2() - endpoint: " << _endpoint->str() << ", nbytes: " << _nbytes + << std::endl; + } RemoteHandle(RemoteHandle const&) = delete; RemoteHandle& operator=(RemoteHandle const&) = delete; @@ -315,13 +343,12 @@ class RemoteHandle { if (file_offset + size > _nbytes) { std::stringstream ss; ss << "cannot read " << file_offset << "+" << size << " bytes into a " << _nbytes - << " bytes file (" << _url << ")"; + << " bytes file (" << _endpoint->str() << ")"; throw std::invalid_argument(ss.str()); } auto curl = create_curl_handle(); - - curl.setopt(CURLOPT_URL, _url.c_str()); + _endpoint->setopt(curl); std::string const byte_range = std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index c348949647..bcc0f43347 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -17,9 +17,12 @@ from kvikio._lib.future cimport IOFuture, _wrap_io_future, future cdef extern from "" nogil: + cdef cppclass cpp_HttpEndpoint "kvikio::HttpEndpoint": + cpp_HttpEndpoint(string url) except + + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": - cpp_RemoteHandle(string url, size_t nbytes) except + - cpp_RemoteHandle(string url) except + + cpp_RemoteHandle(cpp_HttpEndpoint endpoint, size_t nbytes) except + + cpp_RemoteHandle(cpp_HttpEndpoint endpoint) except + int nbytes() except + size_t read( void* buf, @@ -40,10 +43,14 @@ cdef class RemoteFile: cdef RemoteFile ret = RemoteFile() cdef string u = str.encode(str(url)) if nbytes is None: - ret._handle = make_unique[cpp_RemoteHandle](u) + ret._handle = make_unique[cpp_RemoteHandle]( + make_unique[cpp_HttpEndpoint](u) + ) return ret cdef size_t n = nbytes - ret._handle = make_unique[cpp_RemoteHandle](u, n) + ret._handle = make_unique[cpp_RemoteHandle]( + make_unique[cpp_HttpEndpoint](u), n + ) return ret def nbytes(self) -> int: From e8f6b12ab8e6d63c83c24942b061142190b2f23d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 20 Sep 2024 14:43:46 +0200 Subject: [PATCH 19/64] cpp_RemoteEndpoint --- python/kvikio/kvikio/_lib/remote_handle.pyx | 36 ++++++++++++++------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index bcc0f43347..4d0cd052f3 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -10,19 +10,24 @@ from cython.operator cimport dereference as deref from libc.stdint cimport uintptr_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string -from libcpp.utility cimport pair +from libcpp.utility cimport move, pair from kvikio._lib.arr cimport parse_buffer_argument from kvikio._lib.future cimport IOFuture, _wrap_io_future, future cdef extern from "" nogil: + cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": + pass + cdef cppclass cpp_HttpEndpoint "kvikio::HttpEndpoint": cpp_HttpEndpoint(string url) except + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": - cpp_RemoteHandle(cpp_HttpEndpoint endpoint, size_t nbytes) except + - cpp_RemoteHandle(cpp_HttpEndpoint endpoint) except + + cpp_RemoteHandle( + unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes + ) except + + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + int nbytes() except + size_t read( void* buf, @@ -35,22 +40,31 @@ cdef extern from "" nogil: size_t file_offset ) except + +cdef string _to_string(str_or_none): + """Convert Python object to a C++ string (if None, return the empty string)""" + if str_or_none is None: + return string() + return str.encode(str(str_or_none)) + + cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @classmethod - def from_url(cls, url: str, nbytes: Optional[int]): + def from_url( + cls, + url: str, + nbytes: Optional[int], + ): cdef RemoteFile ret = RemoteFile() - cdef string u = str.encode(str(url)) + cdef unique_ptr[cpp_HttpEndpoint] ep = make_unique[cpp_HttpEndpoint]( + _to_string(url) + ) if nbytes is None: - ret._handle = make_unique[cpp_RemoteHandle]( - make_unique[cpp_HttpEndpoint](u) - ) + ret._handle = make_unique[cpp_RemoteHandle](move(ep)) return ret cdef size_t n = nbytes - ret._handle = make_unique[cpp_RemoteHandle]( - make_unique[cpp_HttpEndpoint](u), n - ) + ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) return ret def nbytes(self) -> int: From 5f736772116d56ab5414177e516b14c46fd735a1 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 20 Sep 2024 14:57:21 +0200 Subject: [PATCH 20/64] from_http_url --- python/kvikio/kvikio/remote_file.py | 30 +++++++++++++++++++++++++---- python/kvikio/tests/test_http_io.py | 10 +++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 2fa0cb2c92..90a8b81fb4 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -33,17 +33,39 @@ def _get_remote_module(): class RemoteFile: - """File handle of a remote file (currently, only AWS S3 is supported).""" + """File handle of a remote file.""" - def __init__(self, url: str, nbytes: Optional[int] = None): - """Open a remote file given a bucket and object name. + def __init__(self, handle): + """Create a remote file from a Cython handle. + + This constructor should not be called directly instead use a + factory method like `RemoteFile.from_http_url()` + + Parameters + ---------- + handle : kvikio._lib.remote_handle.RemoteFile + The Cython handle + """ + assert isinstance(handle, _get_remote_module().RemoteFile) + self._handle = handle + + @classmethod + def from_http_url( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a http file. Parameters ---------- url URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. """ - self._handle = _get_remote_module().RemoteFile.from_url(url, nbytes) + return RemoteFile(_get_remote_module().RemoteFile.from_url(url, nbytes)) def __enter__(self) -> RemoteFile: return self diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 9b510d042e..6bfb34c93a 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -51,7 +51,7 @@ def http_server(request, tmpdir): def test_file_size(http_server, tmpdir): a = np.arange(100) a.tofile(tmpdir / "a") - with kvikio.RemoteFile(f"{http_server}/a") as f: + with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes @@ -64,7 +64,7 @@ def test_read(http_server, tmpdir, xp, size, nthreads, tasksize): with kvikio.defaults.set_num_threads(nthreads): with kvikio.defaults.set_task_size(tasksize): - with kvikio.RemoteFile(f"{http_server}/a") as f: + with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) assert f.read(b) == a.nbytes @@ -77,7 +77,7 @@ def test_large_read(http_server, tmpdir, xp, nthreads): a.tofile(tmpdir / "a") with kvikio.defaults.set_num_threads(nthreads): - with kvikio.RemoteFile(f"{http_server}/a") as f: + with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) assert f.read(b) == a.nbytes @@ -88,7 +88,7 @@ def test_error_too_small_file(http_server, tmpdir, xp): a = xp.arange(10, dtype="uint8") b = xp.empty(100, dtype="uint8") a.tofile(tmpdir / "a") - with kvikio.RemoteFile(f"{http_server}/a") as f: + with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes with pytest.raises( ValueError, match=r"cannot read 0\+100 bytes into a 10 bytes file" @@ -105,7 +105,7 @@ def test_no_range_support(http_server, tmpdir, xp): a = xp.arange(100, dtype="uint8") a.tofile(tmpdir / "a") b = xp.empty_like(a) - with kvikio.RemoteFile(f"{http_server}/a") as f: + with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes with pytest.raises( OverflowError, match="maybe the server doesn't support file ranges?" From 5dbe2e5ee6a798ea2b02af5a12c8cf21d3b93c5c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 12:59:24 +0200 Subject: [PATCH 21/64] Update cpp/include/kvikio/remote_handle.hpp Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/remote_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 8bde77fd5e..8fbadc02bf 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -380,7 +380,7 @@ class RemoteHandle { /** * @brief Read from remote source into buffer (host or device memory) in parallel. * - * This API is a parallel async version of `.read()` that partition the operation + * This API is a parallel async version of `.read()` that partitions the operation * into tasks of size `task_size` for execution in the default thread pool. * * @param buf Pointer to host or device memory. From f9873c09f931a46e27c054209948933d6813169b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 12:58:27 +0200 Subject: [PATCH 22/64] setopt(): use CURLoption --- cpp/include/kvikio/remote_handle.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 8fbadc02bf..e86e82afec 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -140,8 +140,8 @@ class CurlHandle { CURL* handle() noexcept { return _handle; } - template - void setopt(OPT option, VAL value) + template + void setopt(CURLoption option, VAL value) { CURLcode err = curl_easy_setopt(handle(), option, value); if (err != CURLE_OK) { From 489645219d91940d7f61d7b19b865ecb5c939d2c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 13:01:29 +0200 Subject: [PATCH 23/64] comment: need CURL_WRITEFUNC_ERROR --- dependencies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies.yaml b/dependencies.yaml index 5ab098e2f5..452dd5f1e5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -106,7 +106,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcurl>=7.87.0 # Need + - libcurl>=7.87.0 # Need CURL_WRITEFUNC_ERROR specific: - output_types: conda matrices: From e53b761724ed7e63de908c4f67ac4bfe0c224751 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 13:57:42 +0200 Subject: [PATCH 24/64] LibCurl: use vector --- cpp/include/kvikio/remote_handle.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index e86e82afec..90e00c1a8f 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -22,9 +22,9 @@ #include #include #include -#include #include #include +#include #include @@ -54,7 +54,7 @@ namespace detail { class LibCurl { private: std::mutex _mutex{}; - std::stack _free_curl_handles{}; + std::vector _free_curl_handles{}; LibCurl() { @@ -70,10 +70,10 @@ class LibCurl { ~LibCurl() noexcept { // clean up all retained easy handles - while (!_free_curl_handles.empty()) { - curl_easy_cleanup(_free_curl_handles.top()); - _free_curl_handles.pop(); + for (auto h : _free_curl_handles) { + curl_easy_cleanup(h); } + _free_curl_handles.clear(); curl_global_cleanup(); } @@ -96,8 +96,8 @@ class LibCurl { { std::lock_guard const lock(_mutex); if (!_free_curl_handles.empty()) { - ret = _free_curl_handles.top(); - _free_curl_handles.pop(); + ret = _free_curl_handles.back(); + _free_curl_handles.pop_back(); } } // If not, we create a new handle. @@ -112,7 +112,7 @@ class LibCurl { void put(CURL* handle) { std::lock_guard const lock(_mutex); - _free_curl_handles.push(handle); + _free_curl_handles.push_back(handle); } }; From 4ba4eb458cbb09bbf661c1cc6e284aec650a9fea Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 16:43:34 +0200 Subject: [PATCH 25/64] UniqueHandlePtr --- cpp/include/kvikio/remote_handle.hpp | 68 +++++++++++++++++----------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 90e00c1a8f..a852c5cd3e 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -19,7 +19,9 @@ #error "cannot include , configuration did not find libcurl" #endif +#include #include +#include #include #include #include @@ -52,9 +54,14 @@ namespace detail { * that time and it could cause a deadlock.) */ class LibCurl { + public: + // We hold an unique pointer to the raw curl handle and sets `curl_easy_cleanup` as its Deleter. + using UniqueHandlePtr = std::unique_ptr>; + private: std::mutex _mutex{}; - std::vector _free_curl_handles{}; + // Curl handles free to be used. + std::vector _free_curl_handles{}; LibCurl() { @@ -69,10 +76,6 @@ class LibCurl { } ~LibCurl() noexcept { - // clean up all retained easy handles - for (auto h : _free_curl_handles) { - curl_easy_cleanup(h); - } _free_curl_handles.clear(); curl_global_cleanup(); } @@ -89,30 +92,41 @@ class LibCurl { return _instance; } - CURL* get() + /** + * @brief Return a free curl handle if available + */ + UniqueHandlePtr get_free_handle() { - // Check if we have a handle available. - CURL* ret = nullptr; - { - std::lock_guard const lock(_mutex); - if (!_free_curl_handles.empty()) { - ret = _free_curl_handles.back(); - _free_curl_handles.pop_back(); - } + UniqueHandlePtr ret; + std::lock_guard const lock(_mutex); + if (!_free_curl_handles.empty()) { + ret = std::move(_free_curl_handles.back()); + _free_curl_handles.pop_back(); } - // If not, we create a new handle. - if (ret == nullptr) { - ret = curl_easy_init(); - if (ret == nullptr) { throw std::runtime_error("libcurl: call to curl_easy_init() failed"); } + return ret; + } + + UniqueHandlePtr get() + { + // Check if we have a free handle available. + UniqueHandlePtr ret = get_free_handle(); + if (ret) { + curl_easy_reset(ret.get()); + } else { + // If not, we create a new handle. + CURL* raw_handle = curl_easy_init(); + if (raw_handle == nullptr) { + throw std::runtime_error("libcurl: call to curl_easy_init() failed"); + } + ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); } - curl_easy_reset(ret); return ret; } - void put(CURL* handle) + void put(UniqueHandlePtr handle) { std::lock_guard const lock(_mutex); - _free_curl_handles.push_back(handle); + _free_curl_handles.push_back(std::move(handle)); } }; @@ -122,23 +136,25 @@ class LibCurl { class CurlHandle { private: char _errbuf[CURL_ERROR_SIZE]; - CURL* _handle; + LibCurl::UniqueHandlePtr _handle; std::string _source_file; std::string _source_line; public: - CurlHandle(CURL* handle, std::string source_file, std::string source_line) - : _handle{handle}, _source_file(std::move(source_file)), _source_line(std::move(source_line)) + CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) + : _handle{std::move(handle)}, + _source_file(std::move(source_file)), + _source_line(std::move(source_line)) { } - ~CurlHandle() noexcept { detail::LibCurl::instance().put(_handle); } + ~CurlHandle() noexcept { detail::LibCurl::instance().put(std::move(_handle)); } CurlHandle(CurlHandle const&) = delete; CurlHandle& operator=(CurlHandle const&) = delete; CurlHandle(CurlHandle&& o) = delete; CurlHandle& operator=(CurlHandle&& o) = delete; - CURL* handle() noexcept { return _handle; } + CURL* handle() noexcept { return _handle.get(); } template void setopt(CURLoption option, VAL value) From bf716be98ec1824ad9fafb6ecc9476916082b94e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 16:49:09 +0200 Subject: [PATCH 26/64] get_handle() / retain_handle() --- cpp/include/kvikio/remote_handle.hpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index a852c5cd3e..f2ea4d1b9f 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -93,7 +93,7 @@ class LibCurl { } /** - * @brief Return a free curl handle if available + * @brief Returns a free curl handle if available. */ UniqueHandlePtr get_free_handle() { @@ -106,7 +106,10 @@ class LibCurl { return ret; } - UniqueHandlePtr get() + /** + * @brief Returns a curl handle, create a new handle if none is available. + */ + UniqueHandlePtr get_handle() { // Check if we have a free handle available. UniqueHandlePtr ret = get_free_handle(); @@ -123,7 +126,10 @@ class LibCurl { return ret; } - void put(UniqueHandlePtr handle) + /** + * @brief Retain a curl handle for later use. + */ + void retain_handle(UniqueHandlePtr handle) { std::lock_guard const lock(_mutex); _free_curl_handles.push_back(std::move(handle)); @@ -147,7 +153,7 @@ class CurlHandle { _source_line(std::move(source_line)) { } - ~CurlHandle() noexcept { detail::LibCurl::instance().put(std::move(_handle)); } + ~CurlHandle() noexcept { detail::LibCurl::instance().retain_handle(std::move(_handle)); } CurlHandle(CurlHandle const&) = delete; CurlHandle& operator=(CurlHandle const&) = delete; @@ -209,7 +215,7 @@ class CurlHandle { #define create_curl_handle() \ kvikio::detail::CurlHandle( \ - kvikio::detail::LibCurl::instance().get(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) + kvikio::detail::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) struct CallbackContext { char* buf; From 83774e322a4e4848402fb71ca0184c240bf7a01f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 16:53:03 +0200 Subject: [PATCH 27/64] clean up --- cpp/include/kvikio/remote_handle.hpp | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index f2ea4d1b9f..be3276a98d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -137,7 +137,7 @@ class LibCurl { }; /** - * @brief A wrapper of a curl easy handle pointer `CURL*`. + * @brief A wrapper of a curl easy handle pointer. */ class CurlHandle { private: @@ -152,6 +152,15 @@ class CurlHandle { _source_file(std::move(source_file)), _source_line(std::move(source_line)) { + // Need CURLOPT_NOSIGNAL to support threading, see + // + setopt(CURLOPT_NOSIGNAL, 1L); + + // We always set CURLOPT_ERRORBUFFER to get better error messages. + setopt(CURLOPT_ERRORBUFFER, _errbuf); + + // Make curl_easy_perform() fail when receiving HTTP code errors. + setopt(CURLOPT_FAILONERROR, 1L); } ~CurlHandle() noexcept { detail::LibCurl::instance().retain_handle(std::move(_handle)); } @@ -175,16 +184,6 @@ class CurlHandle { } void perform() { - // Need CURLOPT_NOSIGNAL to support threading, see - // - setopt(CURLOPT_NOSIGNAL, 1L); - - // We always set CURLOPT_ERRORBUFFER to get better error messages. - setopt(CURLOPT_ERRORBUFFER, _errbuf); - - // Make curl_easy_perform() fail when receiving HTTP code errors. - setopt(CURLOPT_FAILONERROR, 1L); - // Perform the curl operation and check for errors. CURLcode err = curl_easy_perform(handle()); if (err != CURLE_OK) { @@ -311,8 +310,6 @@ class RemoteHandle { RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { - std::cout << "RemoteHandle1() - endpoint: " << _endpoint->str() << ", nbytes: " << _nbytes - << std::endl; } RemoteHandle(std::unique_ptr endpoint) { @@ -331,9 +328,6 @@ class RemoteHandle { _nbytes = cl; _endpoint = std::move(endpoint); - - std::cout << "RemoteHandle2() - endpoint: " << _endpoint->str() << ", nbytes: " << _nbytes - << std::endl; } RemoteHandle(RemoteHandle const&) = delete; From e884d3c44935d1542d11cfb68fe3696fb5244fae Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Sep 2024 16:55:17 +0200 Subject: [PATCH 28/64] doc --- cpp/include/kvikio/remote_handle.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index be3276a98d..2ca4e6b900 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -266,6 +266,7 @@ inline std::size_t callback_device_memory(char* data, CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); + // We have to sync since curl moght overwrite or free `data`. CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); ctx->offset += nbytes; From 66370893798254425ab964795359e7ca184e041d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 24 Sep 2024 11:00:47 +0200 Subject: [PATCH 29/64] utils: LocalHttpServer --- python/kvikio/kvikio/utils.py | 58 +++++++++++++++++++++++++++++ python/kvikio/tests/test_http_io.py | 28 ++------------ 2 files changed, 61 insertions(+), 25 deletions(-) create mode 100644 python/kvikio/kvikio/utils.py diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py new file mode 100644 index 0000000000..d7229c2959 --- /dev/null +++ b/python/kvikio/kvikio/utils.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import functools +import multiprocessing +import threading +import time + + +class LocalHttpServer: + """Local http server - slow but convenient""" + + @staticmethod + def _server( + queue: multiprocessing.Queue, + root_path: str, + range_support: bool, + max_lifetime: int, + ): + from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + if range_support: + from RangeHTTPServer import RangeRequestHandler + + handler = RangeRequestHandler + else: + handler = SimpleHTTPRequestHandler + httpd = ThreadingHTTPServer( + ("127.0.0.1", 0), functools.partial(handler, directory=root_path) + ) + thread = threading.Thread(target=httpd.serve_forever) + thread.start() + queue.put(httpd.server_address) + time.sleep(max_lifetime) + print( + f"ThreadingHTTPServer shutting down because of timeout ({max_lifetime}sec)" + ) + + def __init__(self, root_path: str, range_support: bool, max_lifetime: int) -> None: + self.root_path = root_path + self.range_support = range_support + self.max_lifetime = max_lifetime + + def __enter__(self): + queue = multiprocessing.Queue() + self.process = multiprocessing.Process( + target=LocalHttpServer._server, + args=(queue, str(self.root_path), self.range_support, self.max_lifetime), + ) + self.process.start() + ip, port = queue.get() + self.ip = ip + self.port = port + self.url = f"http://{ip}:{port}" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.process.kill() diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 6bfb34c93a..54d2d88f1d 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -2,18 +2,12 @@ # See file LICENSE for terms. -import functools -import multiprocessing as mp -import threading -import time -from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer - import numpy as np import pytest -from RangeHTTPServer import RangeRequestHandler import kvikio import kvikio.defaults +from kvikio.utils import LocalHttpServer pytestmark = pytest.mark.skipif( not kvikio.is_remote_file_available(), @@ -21,18 +15,6 @@ ) -def start_http_server(queue: mp.Queue, tmpdir: str, range_support: bool): - handler = RangeRequestHandler if range_support else SimpleHTTPRequestHandler - httpd = ThreadingHTTPServer( - ("127.0.0.1", 0), functools.partial(handler, directory=tmpdir) - ) - thread = threading.Thread(target=httpd.serve_forever) - thread.start() - queue.put(httpd.server_address) - time.sleep(60) - print("ThreadingHTTPServer shutting down because of timeout (60sec)") - - @pytest.fixture def http_server(request, tmpdir): """Fixture to set up http server in separate process""" @@ -40,12 +22,8 @@ def http_server(request, tmpdir): if hasattr(request, "param"): range_support = request.param.get("range_support", True) - queue = mp.Queue() - p = mp.Process(target=start_http_server, args=(queue, str(tmpdir), range_support)) - p.start() - ip, port = queue.get() - yield f"http://{ip}:{port}" - p.kill() + with LocalHttpServer(tmpdir, range_support, max_lifetime=60) as server: + yield server.url def test_file_size(http_server, tmpdir): From a0098e9f25987cbc76638ed99dca03cae12692d0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 24 Sep 2024 15:00:42 +0200 Subject: [PATCH 30/64] http benchmark --- python/kvikio/kvikio/benchmarks/http_io.py | 174 +++++++++++++++++++++ python/kvikio/kvikio/utils.py | 5 +- 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 python/kvikio/kvikio/benchmarks/http_io.py diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py new file mode 100644 index 0000000000..1234c28304 --- /dev/null +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -0,0 +1,174 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import argparse +import contextlib +import pathlib +import statistics +import tempfile +import time +from functools import partial + +import cupy +import numpy +from dask.utils import format_bytes + +import kvikio +import kvikio.defaults +from kvikio.utils import LocalHttpServer + + +def run_numpy_like(args, xp): + src = numpy.arange(args.nelem, dtype=args.dtype) + src.tofile(args.server_root_path / "data") + dst = xp.empty_like(src) + url = f"{args.server_url}/data" + + def run() -> float: + t0 = time.perf_counter() + with kvikio.RemoteFile.from_http_url(url, nbytes=src.nbytes) as f: + res = f.read(dst) + t1 = time.perf_counter() + assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" + xp.testing.assert_array_equal(src, dst) + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +API = { + "cupy-kvikio": partial(run_numpy_like, xp=cupy), + "numpy-kvikio": partial(run_numpy_like, xp=numpy), +} + + +def main(args): + cupy.cuda.set_allocator(None) # Disable CuPy's default memory pool + cupy.arange(10) # Make sure CUDA is initialized + + kvikio.defaults.num_threads_reset(args.nthreads) + print("Roundtrip benchmark") + print("--------------------------------------") + print(f"nelem | {args.nelem} ({format_bytes(args.nbytes)})") + print(f"dtype | {args.dtype}") + print(f"nthreads | {args.nthreads}") + print(f"nruns | {args.nruns}") + print(f"server | {args.server}") + if args.server is None: + print("--------------------------------------") + print("WARNING: the bundled server is slow, ") + print("consider using --server.") + print("======================================") + + # Run each benchmark using the requested APIs + for api in args.api: + res = [] + for elapsed in API[api](args): + res.append(elapsed) + + def pprint_api_res(name, samples): + samples = [args.nbytes / s for s in samples] # Convert to throughput + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] + ret = f"{api}-{name}".ljust(18) + ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) + if len(samples) > 1: + stdev = statistics.stdev(samples) / mean * 100 + ret += " ± %5.2f %%" % stdev + ret += " (" + for sample in samples: + ret += f"{format_bytes(sample)}/s, " + ret = ret[:-2] + ")" # Replace trailing comma + return ret + + print(pprint_api_res("read", res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="HTTP benchmark") + parser.add_argument( + "-n", + "--nelem", + metavar="NELEM", + default="1024", + type=int, + help="Number of elements (default: %(default)s).", + ) + parser.add_argument( + "--dtype", + metavar="DATATYPE", + default="float32", + type=numpy.dtype, + help="The data type of each element (default: %(default)s).", + ) + parser.add_argument( + "--nruns", + metavar="RUNS", + default=1, + type=int, + help="Number of runs per API (default: %(default)s).", + ) + parser.add_argument( + "-t", + "--nthreads", + metavar="THREADS", + default=1, + type=int, + help="Number of threads to use (default: %(default)s).", + ) + parser.add_argument( + "--server", + default=None, + help=( + "Connect to an external http server as opposed " + "to the bundled (very slow) HTTP server. " + "Remember to also set --server-root-path." + ), + ) + parser.add_argument( + "--server-root-path", + default=None, + help="Path to the root directory that `--server` serves (local path).", + ) + parser.add_argument( + "--bundled-server-lifetime", + metavar="SECONDS", + default=3600, + type=int, + help="Maximum lifetime of the bundled server (default: %(default)s).", + ) + parser.add_argument( + "--api", + metavar="API", + default=list(API.keys())[0], # defaults to the first API + nargs="+", + choices=tuple(API.keys()) + ("all",), + help="List of APIs to use {%(choices)s} (default: %(default)s).", + ) + args = parser.parse_args() + args.nbytes = args.nelem * args.dtype.itemsize + if "all" in args.api: + args.api = tuple(API.keys()) + + with contextlib.ExitStack() as context_stack: + if args.server is None: + # Create a tmp dir for the bundled server to serve + temp_dir = tempfile.TemporaryDirectory() + args.bundled_server_root_dir = pathlib.Path(temp_dir.name) + context_stack.enter_context(temp_dir) + + # Create the bundled server + bundled_server = LocalHttpServer( + root_path=args.bundled_server_root_dir, + range_support=True, + max_lifetime=args.bundled_server_lifetime, + ) + context_stack.enter_context(bundled_server) + args.server_url = bundled_server.url + args.server_root_path = args.bundled_server_root_dir + else: + args.server_url = args.server + if args.server_root_path is None: + raise ValueError("please set --server-root-path") + args.server_root_path = pathlib.Path(args.server_root_path) + main(args) diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index d7229c2959..21cc7dbd74 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -3,6 +3,7 @@ import functools import multiprocessing +import pathlib import threading import time @@ -36,7 +37,9 @@ def _server( f"ThreadingHTTPServer shutting down because of timeout ({max_lifetime}sec)" ) - def __init__(self, root_path: str, range_support: bool, max_lifetime: int) -> None: + def __init__( + self, root_path: str | pathlib.Path, range_support: bool, max_lifetime: int + ) -> None: self.root_path = root_path self.range_support = range_support self.max_lifetime = max_lifetime From 0e6dc7af7fbe5fa8e39db9b9f4b1cbbbc8e3dbe7 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 24 Sep 2024 16:48:17 +0200 Subject: [PATCH 31/64] test_http_io --- python/kvikio/tests/test_benchmarks.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 3bdaf6613e..13709b9e0d 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -8,6 +8,8 @@ import pytest +import kvikio + benchmarks_path = ( Path(os.path.realpath(__file__)).parent.parent / "kvikio" / "benchmarks" ) @@ -78,3 +80,31 @@ def test_zarr_io(run_cmd, tmp_path, api): cwd=benchmarks_path, ) assert retcode == 0 + + +@pytest.mark.parametrize( + "api", + [ + "cupy-kvikio", + "numpy-kvikio", + ], +) +def test_http_io(run_cmd, tmp_path, api): + """Test benchmarks/http_io.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "cannot test remote IO, please build KvikIO with with AWS S3 support" + ) + retcode = run_cmd( + cmd=[ + sys.executable, + "http_io.py", + "-n", + "1000", + "--api", + api, + ], + cwd=benchmarks_path, + ) + assert retcode == 0 From f0b2f9faa390460ec377db4f0028cd5c66ff149d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 25 Sep 2024 14:43:44 +0200 Subject: [PATCH 32/64] example --- docs/source/api.rst | 7 ++++++ docs/source/index.rst | 1 + docs/source/remote_file.rst | 11 +++++++++ python/kvikio/examples/http_io.py | 37 ++++++++++++++++++++++++++++ python/kvikio/kvikio/utils.py | 5 +++- python/kvikio/tests/test_examples.py | 9 ++++++- 6 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 docs/source/remote_file.rst create mode 100644 python/kvikio/examples/http_io.py diff --git a/docs/source/api.rst b/docs/source/api.rst index 4d19c09bbb..fd34367a00 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -18,6 +18,13 @@ Zarr .. autoclass:: GDSStore :members: +RemoteFile +---------- +.. currentmodule:: kvikio.remote_file + +.. autoclass:: RemoteFile + :members: + Defaults -------- .. currentmodule:: kvikio.defaults diff --git a/docs/source/index.rst b/docs/source/index.rst index 4dd491fd96..9e302b5f44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,6 +23,7 @@ Contents install quickstart zarr + remote_file runtime_settings api genindex diff --git a/docs/source/remote_file.rst b/docs/source/remote_file.rst new file mode 100644 index 0000000000..ed6fe45b7b --- /dev/null +++ b/docs/source/remote_file.rst @@ -0,0 +1,11 @@ +Remote File +=========== + +KvikIO provides direct access to remote files. + + +Example +------- + +.. literalinclude:: ../../python/kvikio/examples/http_io.py + :language: python diff --git a/python/kvikio/examples/http_io.py b/python/kvikio/examples/http_io.py new file mode 100644 index 0000000000..c74db75b99 --- /dev/null +++ b/python/kvikio/examples/http_io.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import pathlib +import tempfile + +import cupy +import numpy + +import kvikio +from kvikio.utils import LocalHttpServer + + +def main(tmpdir: pathlib.Path): + a = cupy.arange(100) + a.tofile(tmpdir / "myfile") + b = cupy.empty_like(a) + + # Start a local server that serves files in `tmpdir` + with LocalHttpServer(root_path=tmpdir) as server: + # Open remote file from a http url + with kvikio.RemoteFile.from_http_url(f"{server.url}/myfile") as f: + # KvikIO fetch the file size + assert f.nbytes() == a.nbytes + # Read the remote file into `b` as if it was a local file. + f.read(b) + assert all(a == b) + # We can also read into host memory seamlessly + a = cupy.asnumpy(a) + c = numpy.empty_like(a) + f.read(c) + assert all(a == c) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tmpdir: + main(pathlib.Path(tmpdir)) diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index 21cc7dbd74..74b55dba04 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -38,7 +38,10 @@ def _server( ) def __init__( - self, root_path: str | pathlib.Path, range_support: bool, max_lifetime: int + self, + root_path: str | pathlib.Path, + range_support: bool = True, + max_lifetime: int = 120, ) -> None: self.root_path = root_path self.range_support = range_support diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index e9e1f83d08..493b2253d7 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. import os @@ -26,3 +26,10 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): monkeypatch.syspath_prepend(str(examples_path)) import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") + + +def test_http_io(tmp_path, monkeypatch): + """Test examples/http_io.py""" + + monkeypatch.syspath_prepend(str(examples_path)) + import_module("http_io").main(tmp_path) From 8e7e2cfed209e7b6964db9964911d6c4d6973c47 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 26 Sep 2024 10:28:06 +0200 Subject: [PATCH 33/64] cleanup --- cpp/include/kvikio/remote_handle.hpp | 74 ++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 2ca4e6b900..b6d1cbb5e9 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -42,7 +42,8 @@ namespace detail { /** * @brief Singleton class to initialize and cleanup the global state of libcurl * - * https://curl.se/libcurl/c/libcurl.html + * Notice, libcurl allows the use of a singleton class: + * * In a C++ module, it is common to deal with the global constant situation by defining a special * class that represents the global constant environment of the module. A program always has exactly * one object of the class, in static storage. That way, the program automatically calls the @@ -52,6 +53,8 @@ namespace detail { * to think about it. (Caveat: If you are initializing libcurl from a Windows DLL you should not * initialize it from DllMain or a static initializer because Windows holds the loader lock during * that time and it could cause a deadlock.) + * + * Source . */ class LibCurl { public: @@ -80,11 +83,6 @@ class LibCurl { curl_global_cleanup(); } - LibCurl(LibCurl const&) = delete; - LibCurl& operator=(LibCurl const&) = delete; - LibCurl(LibCurl&& o) = delete; - LibCurl& operator=(LibCurl&& o) = delete; - public: static LibCurl& instance() { @@ -137,7 +135,10 @@ class LibCurl { }; /** - * @brief A wrapper of a curl easy handle pointer. + * @brief Representation of a curl easy handle pointer and its operations. + * + * An instance is given a `LibCurl::UniqueHandlePtr` on creation, which is + * later retailed on destruction. */ class CurlHandle { private: @@ -147,6 +148,15 @@ class CurlHandle { std::string _source_line; public: + /** + * @brief Construct a new curl handle. + * + * Typically, do not use this directly instead use the `create_curl_handle()` macro. + * + * @param handle An unused curl easy handle pointer, which is retailed on destruction. + * @param source_file Path of source file of the caller (for error messages). + * @param source_line Line of source file of the caller (for error messages). + */ CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) : _handle{std::move(handle)}, _source_file(std::move(source_file)), @@ -164,13 +174,27 @@ class CurlHandle { } ~CurlHandle() noexcept { detail::LibCurl::instance().retain_handle(std::move(_handle)); } + /** + * @brief CurlHandle support is not movable or copyable. + */ CurlHandle(CurlHandle const&) = delete; CurlHandle& operator=(CurlHandle const&) = delete; CurlHandle(CurlHandle&& o) = delete; CurlHandle& operator=(CurlHandle&& o) = delete; + /** + * @brief Get the underlying curl easy handle pointer. + */ CURL* handle() noexcept { return _handle.get(); } + /** + * @brief Set option for the curl handle. + * + * See for available options. + * + * @tparam VAL The type of the value. + * @param option The curl option to set. + */ template void setopt(CURLoption option, VAL value) { @@ -182,6 +206,12 @@ class CurlHandle { throw std::runtime_error(ss.str()); } } + + /** + * @brief Perform a blocking network transfer using previously set options. + * + * See . + */ void perform() { // Perform the curl operation and check for errors. @@ -199,10 +229,18 @@ class CurlHandle { } } - template - void getinfo(INFO info, VALUE value) + /** + * @brief Extract information from a curl handle. + * + * See for available options. + * + * @tparam OUTPUT The type of the output. + * @param output The output, which is used as-is: `curl_easy_getinfo(..., output)`. + */ + template + void getinfo(CURLINFO info, OUTPUT output) { - CURLcode err = curl_easy_getinfo(handle(), info, value); + CURLcode err = curl_easy_getinfo(handle(), info, output); if (err != CURLE_OK) { std::stringstream ss; ss << "curl_easy_getinfo() error near " << _source_file << ":" << _source_line; @@ -212,6 +250,11 @@ class CurlHandle { } }; +/** + * @brief Create a new curl handle. + * + * @returns A `kvikio::detail::CurlHandle` instance ready to be used. + */ #define create_curl_handle() \ kvikio::detail::CurlHandle( \ kvikio::detail::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) @@ -234,11 +277,6 @@ inline std::size_t callback_host_memory(char* data, ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; } - - // std::cout << "callback_host_memory() - data: " << ((void*)data) - // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset - // << ", nbytes: " << nbytes << std::endl; - std::memcpy(ctx->buf + ctx->offset, data, nbytes); ctx->offset += nbytes; return nbytes; @@ -259,11 +297,6 @@ inline std::size_t callback_device_memory(char* data, CUcontext cuda_ctx = get_context_from_pointer(ctx->buf); PushAndPopContext c(cuda_ctx); CUstream stream = detail::StreamsByThread::get(); - - // std::cout << "callback_device_memory() - data: " << ((void*)data) - // << ", ctx->buf: " << (void*)ctx->buf << ", offset: " << ctx->offset - // << ", nbytes: " << nbytes << std::endl; - CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); // We have to sync since curl moght overwrite or free `data`. @@ -380,7 +413,6 @@ class RemoteHandle { .buf = reinterpret_cast(buf), .size = size, .offset = 0, .overflow_error = false}; curl.setopt(CURLOPT_WRITEDATA, &ctx); - // std::cout << "read() - buf: " << buf << ", byte_range: " << byte_range << std::endl; try { curl.perform(); } catch (std::runtime_error const& e) { From fa672191f36b05db0bd1164bdbdba721e320e154 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Sun, 29 Sep 2024 17:35:47 +0200 Subject: [PATCH 34/64] cmake: rapids_cpm_find CURL --- cpp/CMakeLists.txt | 6 +---- cpp/cmake/thirdparty/get_libcurl.cmake | 32 ++++++++++++++++++++++++++ python/kvikio/kvikio/remote_file.py | 3 ++- python/kvikio/tests/test_benchmarks.py | 3 ++- 4 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_libcurl.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8f464ba5d5..3bb65e1bbc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -51,11 +51,7 @@ rapids_find_package( ) if(KvikIO_REMOTE_SUPPORT) - rapids_find_package( - CURL 7.87.0 REQUIRED - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports - ) + include(cmake/thirdparty/get_libcurl.cmake) endif() rapids_find_package( diff --git a/cpp/cmake/thirdparty/get_libcurl.cmake b/cpp/cmake/thirdparty/get_libcurl.cmake new file mode 100644 index 0000000000..09b637d1cb --- /dev/null +++ b/cpp/cmake/thirdparty/get_libcurl.cmake @@ -0,0 +1,32 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds libcurl and sets any additional necessary environment variables. +function(find_and_configure_libcurl) + include(${rapids-cmake-dir}/cpm/find.cmake) + + rapids_cpm_find( + CURL 7.87.0 + GLOBAL_TARGETS libcurl + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/curl/curl + GIT_TAG curl-8_10_1 # let's get a newer release if we have to build it ourselves anyways. + OPTIONS "BUILD_CURL_EXE OFF" "BUILD_SHARED_LIBS OFF" "BUILD_TESTING OFF" "CURL_USE_LIBPSL OFF" + "CURL_DISABLE_LDAP ON" "CMAKE_POSITION_INDEPENDENT_CODE ON" + ) +endfunction() + +find_and_configure_libcurl() diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 90a8b81fb4..db8715c086 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -25,7 +25,8 @@ def _get_remote_module(): """Get the remote module or raise an error""" if not is_remote_file_available(): raise RuntimeError( - "RemoteFile not available, please build KvikIO with AWS S3 support" + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" ) import kvikio._lib.remote_handle diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 13709b9e0d..42c2fc94ca 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -94,7 +94,8 @@ def test_http_io(run_cmd, tmp_path, api): if not kvikio.is_remote_file_available(): pytest.skip( - "cannot test remote IO, please build KvikIO with with AWS S3 support" + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" ) retcode = run_cmd( cmd=[ From 9906a14875b9b8c95d577a744cf5f64a5fc3c066 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Sep 2024 10:08:07 +0200 Subject: [PATCH 35/64] cleanup --- python/kvikio/tests/test_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 42c2fc94ca..98d54fb6b3 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -89,7 +89,7 @@ def test_zarr_io(run_cmd, tmp_path, api): "numpy-kvikio", ], ) -def test_http_io(run_cmd, tmp_path, api): +def test_http_io(run_cmd, api): """Test benchmarks/http_io.py""" if not kvikio.is_remote_file_available(): From d38121b26b5507471633d26326687991cc897b0e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Sep 2024 13:16:53 +0200 Subject: [PATCH 36/64] doc --- cpp/include/kvikio/remote_handle.hpp | 44 ++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b6d1cbb5e9..1836c5bade 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -309,17 +309,34 @@ inline std::size_t callback_device_memory(char* data, } // namespace detail /** - * @brief + * @brief Abstract base class for remote endpoints. + * + * In this context, an endpoint refers to a remote file using a specify communication protocol. + * + * Each communication protocol, such as HTTP or S3, needs to implement this ABC and implement + * their own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { public: - RemoteEndpoint() {} + /** + * @brief Set needed connection options on a curl handle. + * + * Subsequently, a call to `curl.perform()` should connect to the endpoint. + * + * @param curl The curl handle. + */ virtual void setopt(detail::CurlHandle& curl) = 0; - virtual std::string str() = 0; + + /** + * @brief Get a description of this remote point instance. + * + * @returns A string description. + */ + virtual std::string str() = 0; }; /** - * @brief + * @brief A remote endpoint using http. */ class HttpEndpoint : public RemoteEndpoint { private: @@ -341,10 +358,24 @@ class RemoteHandle { std::size_t _nbytes; public: + /** + * @brief Create a new remote handle from an endpoint and a file size. + * + * @param endpoint Remote endpoint used for subsequently IO. + * @param nbytes The size of the remote file (in bytes). + */ RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { } + + /** + * @brief Create a new remote handle from an endpoint (infers the file size). + * + * The file size is received from the remote server using `endpoint`. + * + * @param endpoint Remote endpoint used for subsequently IO. + */ RemoteHandle(std::unique_ptr endpoint) { auto curl = create_curl_handle(); @@ -364,10 +395,11 @@ class RemoteHandle { _endpoint = std::move(endpoint); } + // A remote handle is moveable but not copyable. + RemoteHandle(RemoteHandle&& o) = default; + RemoteHandle& operator=(RemoteHandle&& o) = default; RemoteHandle(RemoteHandle const&) = delete; RemoteHandle& operator=(RemoteHandle const&) = delete; - RemoteHandle(RemoteHandle&& o) = delete; - RemoteHandle& operator=(RemoteHandle&& o) = delete; /** * @brief Get the file size. From bdc1e2207e1b5c76515b4cde039a3ef47055713c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Sep 2024 13:21:48 +0200 Subject: [PATCH 37/64] cleanup --- cpp/examples/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt index 284590e943..c12ddb2e52 100644 --- a/cpp/examples/CMakeLists.txt +++ b/cpp/examples/CMakeLists.txt @@ -14,8 +14,6 @@ set(TEST_INSTALL_PATH bin/tests/libkvikio) -# Example: basic_io - if(CUDAToolkit_FOUND) add_executable(BASIC_IO_TEST basic_io.cpp) set_target_properties(BASIC_IO_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") @@ -37,8 +35,6 @@ else() message(STATUS "Cannot build the basic_io example when CUDA is not found") endif() -# Example: basic_no_cuda - add_executable(BASIC_NO_CUDA_TEST basic_no_cuda.cpp) set_target_properties(BASIC_NO_CUDA_TEST PROPERTIES INSTALL_RPATH "\$ORIGIN/../../lib") target_include_directories(BASIC_NO_CUDA_TEST PRIVATE ../include) From 383fa4c67cb21c62fd58c68fe5ca05769e95f683 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Sep 2024 13:39:49 +0200 Subject: [PATCH 38/64] clean up --- python/kvikio/kvikio/_lib/CMakeLists.txt | 5 ++++- python/kvikio/kvikio/utils.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 775c0b76de..85da4d9b2b 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -21,7 +21,10 @@ if(TARGET CURL::libcurl) message(STATUS "Building remote_handle.pyx (libcurl found)") list(APPEND cython_modules remote_handle.pyx) else() - message(WARNING "Skipping remote_handle.pyx (libcurl not found or disabled)") + message( + WARNING + "Skipping remote_handle.pyx (please set KvikIO_REMOTE_SUPPORT=ON for remote file support)" + ) endif() rapids_cython_create_modules( diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index 74b55dba04..c210878788 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -6,6 +6,7 @@ import pathlib import threading import time +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer class LocalHttpServer: @@ -18,8 +19,6 @@ def _server( range_support: bool, max_lifetime: int, ): - from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer - if range_support: from RangeHTTPServer import RangeRequestHandler From 0055373b2d62d5170037c158d6b76d05f2851a9d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Sep 2024 13:52:50 +0200 Subject: [PATCH 39/64] docs --- python/kvikio/kvikio/utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index c210878788..c7ed39c7a4 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -42,6 +42,24 @@ def __init__( range_support: bool = True, max_lifetime: int = 120, ) -> None: + """Create a context that starts a local http server. + + Example + ------- + >>> with LocalHttpServer(root_path="/my/server/") as server: + ... with kvikio.RemoteFile.from_http_url(f"{server.url}/myfile") as f: + ... f.read(...) + + Parameters + ---------- + root_path + Path to the directory the server will serve. + range_support + Whether to support the ranges, required by `RemoteFile.from_http_url()`. + Depend on the `RangeHTTPServer` module (`pip install rangehttpserver`). + max_lifetime + Maximum lifetime of the server (in seconds). + """ self.root_path = root_path self.range_support = range_support self.max_lifetime = max_lifetime From 628e525748a94acd0ada5bdd72f9d3411f5d8e56 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 08:22:53 +0200 Subject: [PATCH 40/64] doc --- python/kvikio/tests/test_http_io.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index 54d2d88f1d..eaa1d4b0b8 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -11,7 +11,10 @@ pytestmark = pytest.mark.skipif( not kvikio.is_remote_file_available(), - reason="cannot test remote IO, please build KvikIO with libcurl", + reason=( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ), ) From 8726f87bee97f1eff8cc6c374ebdf70d1b63b12c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 09:32:06 +0200 Subject: [PATCH 41/64] fix virtual dtor --- cpp/include/kvikio/remote_handle.hpp | 3 ++- python/kvikio/kvikio/benchmarks/http_io.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 1836c5bade..8bdd75dfc3 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -333,6 +333,8 @@ class RemoteEndpoint { * @returns A string description. */ virtual std::string str() = 0; + + virtual ~RemoteEndpoint() = default; }; /** @@ -343,7 +345,6 @@ class HttpEndpoint : public RemoteEndpoint { std::string _url; public: - HttpEndpoint() = default; HttpEndpoint(std::string url) : _url{std::move(url)} {} void setopt(detail::CurlHandle& curl) override { curl.setopt(CURLOPT_URL, _url.c_str()); } std::string str() override { return _url; } diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py index 1234c28304..9a24503a7f 100644 --- a/python/kvikio/kvikio/benchmarks/http_io.py +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -38,8 +38,8 @@ def run() -> float: API = { - "cupy-kvikio": partial(run_numpy_like, xp=cupy), - "numpy-kvikio": partial(run_numpy_like, xp=numpy), + "cupy": partial(run_numpy_like, xp=cupy), + "numpy": partial(run_numpy_like, xp=numpy), } From c94121dee317e8fce8bbf109d0d31c7ace628ff9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 10:27:10 +0200 Subject: [PATCH 42/64] fix benckmark --- cpp/include/kvikio/remote_handle.hpp | 1 + python/kvikio/tests/test_benchmarks.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 8bdd75dfc3..aa98b910b7 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -348,6 +348,7 @@ class HttpEndpoint : public RemoteEndpoint { HttpEndpoint(std::string url) : _url{std::move(url)} {} void setopt(detail::CurlHandle& curl) override { curl.setopt(CURLOPT_URL, _url.c_str()); } std::string str() override { return _url; } + ~HttpEndpoint() override = default; }; /** diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 98d54fb6b3..5b5602e53a 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -85,8 +85,8 @@ def test_zarr_io(run_cmd, tmp_path, api): @pytest.mark.parametrize( "api", [ - "cupy-kvikio", - "numpy-kvikio", + "cupy", + "numpy", ], ) def test_http_io(run_cmd, api): From 0c764ba1561ae1a08f421dccb53aa70dd01b4e6d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 14:14:16 +0200 Subject: [PATCH 43/64] clean up --- cpp/include/kvikio/remote_handle.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index aa98b910b7..ad0e7af01d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -16,7 +16,8 @@ #pragma once #ifndef KVIKIO_LIBCURL_FOUND -#error "cannot include , configuration did not find libcurl" +#error \ + "cannot include , please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" #endif #include From 91d953b067d48185325a7a1da3dd40013e9e07d3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 14:26:56 +0200 Subject: [PATCH 44/64] libcurl.hpp --- cpp/include/kvikio/remote_handle.hpp | 234 +----------------------- cpp/include/kvikio/shim/libcurl.hpp | 260 +++++++++++++++++++++++++++ 2 files changed, 263 insertions(+), 231 deletions(-) create mode 100644 cpp/include/kvikio/shim/libcurl.hpp diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index ad0e7af01d..7edde788e3 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -15,251 +15,23 @@ */ #pragma once -#ifndef KVIKIO_LIBCURL_FOUND -#error \ - "cannot include , please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" -#endif - #include #include -#include #include #include #include #include -#include - -#include #include #include #include #include +#include #include namespace kvikio { namespace detail { -/** - * @brief Singleton class to initialize and cleanup the global state of libcurl - * - * Notice, libcurl allows the use of a singleton class: - * - * In a C++ module, it is common to deal with the global constant situation by defining a special - * class that represents the global constant environment of the module. A program always has exactly - * one object of the class, in static storage. That way, the program automatically calls the - * constructor of the object as the program starts up and the destructor as it terminates. As the - * author of this libcurl-using module, you can make the constructor call curl_global_init and the - * destructor call curl_global_cleanup and satisfy libcurl's requirements without your user having - * to think about it. (Caveat: If you are initializing libcurl from a Windows DLL you should not - * initialize it from DllMain or a static initializer because Windows holds the loader lock during - * that time and it could cause a deadlock.) - * - * Source . - */ -class LibCurl { - public: - // We hold an unique pointer to the raw curl handle and sets `curl_easy_cleanup` as its Deleter. - using UniqueHandlePtr = std::unique_ptr>; - - private: - std::mutex _mutex{}; - // Curl handles free to be used. - std::vector _free_curl_handles{}; - - LibCurl() - { - CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); - if (err != CURLE_OK) { - throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); - } - curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); - if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { - throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); - } - } - ~LibCurl() noexcept - { - _free_curl_handles.clear(); - curl_global_cleanup(); - } - - public: - static LibCurl& instance() - { - static LibCurl _instance; - return _instance; - } - - /** - * @brief Returns a free curl handle if available. - */ - UniqueHandlePtr get_free_handle() - { - UniqueHandlePtr ret; - std::lock_guard const lock(_mutex); - if (!_free_curl_handles.empty()) { - ret = std::move(_free_curl_handles.back()); - _free_curl_handles.pop_back(); - } - return ret; - } - - /** - * @brief Returns a curl handle, create a new handle if none is available. - */ - UniqueHandlePtr get_handle() - { - // Check if we have a free handle available. - UniqueHandlePtr ret = get_free_handle(); - if (ret) { - curl_easy_reset(ret.get()); - } else { - // If not, we create a new handle. - CURL* raw_handle = curl_easy_init(); - if (raw_handle == nullptr) { - throw std::runtime_error("libcurl: call to curl_easy_init() failed"); - } - ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); - } - return ret; - } - - /** - * @brief Retain a curl handle for later use. - */ - void retain_handle(UniqueHandlePtr handle) - { - std::lock_guard const lock(_mutex); - _free_curl_handles.push_back(std::move(handle)); - } -}; - -/** - * @brief Representation of a curl easy handle pointer and its operations. - * - * An instance is given a `LibCurl::UniqueHandlePtr` on creation, which is - * later retailed on destruction. - */ -class CurlHandle { - private: - char _errbuf[CURL_ERROR_SIZE]; - LibCurl::UniqueHandlePtr _handle; - std::string _source_file; - std::string _source_line; - - public: - /** - * @brief Construct a new curl handle. - * - * Typically, do not use this directly instead use the `create_curl_handle()` macro. - * - * @param handle An unused curl easy handle pointer, which is retailed on destruction. - * @param source_file Path of source file of the caller (for error messages). - * @param source_line Line of source file of the caller (for error messages). - */ - CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) - : _handle{std::move(handle)}, - _source_file(std::move(source_file)), - _source_line(std::move(source_line)) - { - // Need CURLOPT_NOSIGNAL to support threading, see - // - setopt(CURLOPT_NOSIGNAL, 1L); - - // We always set CURLOPT_ERRORBUFFER to get better error messages. - setopt(CURLOPT_ERRORBUFFER, _errbuf); - - // Make curl_easy_perform() fail when receiving HTTP code errors. - setopt(CURLOPT_FAILONERROR, 1L); - } - ~CurlHandle() noexcept { detail::LibCurl::instance().retain_handle(std::move(_handle)); } - - /** - * @brief CurlHandle support is not movable or copyable. - */ - CurlHandle(CurlHandle const&) = delete; - CurlHandle& operator=(CurlHandle const&) = delete; - CurlHandle(CurlHandle&& o) = delete; - CurlHandle& operator=(CurlHandle&& o) = delete; - - /** - * @brief Get the underlying curl easy handle pointer. - */ - CURL* handle() noexcept { return _handle.get(); } - - /** - * @brief Set option for the curl handle. - * - * See for available options. - * - * @tparam VAL The type of the value. - * @param option The curl option to set. - */ - template - void setopt(CURLoption option, VAL value) - { - CURLcode err = curl_easy_setopt(handle(), option, value); - if (err != CURLE_OK) { - std::stringstream ss; - ss << "curl_easy_setopt() error near " << _source_file << ":" << _source_line; - ss << "(" << curl_easy_strerror(err) << ")"; - throw std::runtime_error(ss.str()); - } - } - - /** - * @brief Perform a blocking network transfer using previously set options. - * - * See . - */ - void perform() - { - // Perform the curl operation and check for errors. - CURLcode err = curl_easy_perform(handle()); - if (err != CURLE_OK) { - std::string msg(_errbuf); - std::stringstream ss; - ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; - if (msg.empty()) { - ss << "(" << curl_easy_strerror(err) << ")"; - } else { - ss << "(" << msg << ")"; - } - throw std::runtime_error(ss.str()); - } - } - - /** - * @brief Extract information from a curl handle. - * - * See for available options. - * - * @tparam OUTPUT The type of the output. - * @param output The output, which is used as-is: `curl_easy_getinfo(..., output)`. - */ - template - void getinfo(CURLINFO info, OUTPUT output) - { - CURLcode err = curl_easy_getinfo(handle(), info, output); - if (err != CURLE_OK) { - std::stringstream ss; - ss << "curl_easy_getinfo() error near " << _source_file << ":" << _source_line; - ss << "(" << curl_easy_strerror(err) << ")"; - throw std::runtime_error(ss.str()); - } - } -}; - -/** - * @brief Create a new curl handle. - * - * @returns A `kvikio::detail::CurlHandle` instance ready to be used. - */ -#define create_curl_handle() \ - kvikio::detail::CurlHandle( \ - kvikio::detail::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) - struct CallbackContext { char* buf; std::size_t size; @@ -326,7 +98,7 @@ class RemoteEndpoint { * * @param curl The curl handle. */ - virtual void setopt(detail::CurlHandle& curl) = 0; + virtual void setopt(CurlHandle& curl) = 0; /** * @brief Get a description of this remote point instance. @@ -347,7 +119,7 @@ class HttpEndpoint : public RemoteEndpoint { public: HttpEndpoint(std::string url) : _url{std::move(url)} {} - void setopt(detail::CurlHandle& curl) override { curl.setopt(CURLOPT_URL, _url.c_str()); } + void setopt(CurlHandle& curl) override { curl.setopt(CURLOPT_URL, _url.c_str()); } std::string str() override { return _url; } ~HttpEndpoint() override = default; }; diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp new file mode 100644 index 0000000000..cf885f914c --- /dev/null +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef KVIKIO_LIBCURL_FOUND +#error \ + "cannot include the remote IO API, please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace kvikio { + +/** + * @brief Singleton class to initialize and cleanup the global state of libcurl + * + * Notice, libcurl allows the use of a singleton class: + * + * In a C++ module, it is common to deal with the global constant situation by defining a special + * class that represents the global constant environment of the module. A program always has exactly + * one object of the class, in static storage. That way, the program automatically calls the + * constructor of the object as the program starts up and the destructor as it terminates. As the + * author of this libcurl-using module, you can make the constructor call curl_global_init and the + * destructor call curl_global_cleanup and satisfy libcurl's requirements without your user having + * to think about it. (Caveat: If you are initializing libcurl from a Windows DLL you should not + * initialize it from DllMain or a static initializer because Windows holds the loader lock during + * that time and it could cause a deadlock.) + * + * Source . + */ +class LibCurl { + public: + // We hold an unique pointer to the raw curl handle and sets `curl_easy_cleanup` as its Deleter. + using UniqueHandlePtr = std::unique_ptr>; + + private: + std::mutex _mutex{}; + // Curl handles free to be used. + std::vector _free_curl_handles{}; + + LibCurl() + { + CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); + if (err != CURLE_OK) { + throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); + } + curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); + if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { + throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); + } + } + ~LibCurl() noexcept + { + _free_curl_handles.clear(); + curl_global_cleanup(); + } + + public: + static LibCurl& instance() + { + static LibCurl _instance; + return _instance; + } + + /** + * @brief Returns a free curl handle if available. + */ + UniqueHandlePtr get_free_handle() + { + UniqueHandlePtr ret; + std::lock_guard const lock(_mutex); + if (!_free_curl_handles.empty()) { + ret = std::move(_free_curl_handles.back()); + _free_curl_handles.pop_back(); + } + return ret; + } + + /** + * @brief Returns a curl handle, create a new handle if none is available. + */ + UniqueHandlePtr get_handle() + { + // Check if we have a free handle available. + UniqueHandlePtr ret = get_free_handle(); + if (ret) { + curl_easy_reset(ret.get()); + } else { + // If not, we create a new handle. + CURL* raw_handle = curl_easy_init(); + if (raw_handle == nullptr) { + throw std::runtime_error("libcurl: call to curl_easy_init() failed"); + } + ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); + } + return ret; + } + + /** + * @brief Retain a curl handle for later use. + */ + void retain_handle(UniqueHandlePtr handle) + { + std::lock_guard const lock(_mutex); + _free_curl_handles.push_back(std::move(handle)); + } +}; + +/** + * @brief Representation of a curl easy handle pointer and its operations. + * + * An instance is given a `LibCurl::UniqueHandlePtr` on creation, which is + * later retailed on destruction. + */ +class CurlHandle { + private: + char _errbuf[CURL_ERROR_SIZE]; + LibCurl::UniqueHandlePtr _handle; + std::string _source_file; + std::string _source_line; + + public: + /** + * @brief Construct a new curl handle. + * + * Typically, do not use this directly instead use the `create_curl_handle()` macro. + * + * @param handle An unused curl easy handle pointer, which is retailed on destruction. + * @param source_file Path of source file of the caller (for error messages). + * @param source_line Line of source file of the caller (for error messages). + */ + CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) + : _handle{std::move(handle)}, + _source_file(std::move(source_file)), + _source_line(std::move(source_line)) + { + // Need CURLOPT_NOSIGNAL to support threading, see + // + setopt(CURLOPT_NOSIGNAL, 1L); + + // We always set CURLOPT_ERRORBUFFER to get better error messages. + setopt(CURLOPT_ERRORBUFFER, _errbuf); + + // Make curl_easy_perform() fail when receiving HTTP code errors. + setopt(CURLOPT_FAILONERROR, 1L); + } + ~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } + + /** + * @brief CurlHandle support is not movable or copyable. + */ + CurlHandle(CurlHandle const&) = delete; + CurlHandle& operator=(CurlHandle const&) = delete; + CurlHandle(CurlHandle&& o) = delete; + CurlHandle& operator=(CurlHandle&& o) = delete; + + /** + * @brief Get the underlying curl easy handle pointer. + */ + CURL* handle() noexcept { return _handle.get(); } + + /** + * @brief Set option for the curl handle. + * + * See for available options. + * + * @tparam VAL The type of the value. + * @param option The curl option to set. + */ + template + void setopt(CURLoption option, VAL value) + { + CURLcode err = curl_easy_setopt(handle(), option, value); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_setopt() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } + + /** + * @brief Perform a blocking network transfer using previously set options. + * + * See . + */ + void perform() + { + // Perform the curl operation and check for errors. + CURLcode err = curl_easy_perform(handle()); + if (err != CURLE_OK) { + std::string msg(_errbuf); + std::stringstream ss; + ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; + if (msg.empty()) { + ss << "(" << curl_easy_strerror(err) << ")"; + } else { + ss << "(" << msg << ")"; + } + throw std::runtime_error(ss.str()); + } + } + + /** + * @brief Extract information from a curl handle. + * + * See for available options. + * + * @tparam OUTPUT The type of the output. + * @param output The output, which is used as-is: `curl_easy_getinfo(..., output)`. + */ + template + void getinfo(CURLINFO info, OUTPUT output) + { + CURLcode err = curl_easy_getinfo(handle(), info, output); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_getinfo() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } +}; + +/** + * @brief Create a new curl handle. + * + * @returns A `kvikio::CurlHandle` instance ready to be used. + */ +#define create_curl_handle() \ + kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), __FILE__, KVIKIO_STRINGIFY(__LINE__)) + +} // namespace kvikio From 94e47eea3edc0e5b3033cd40655fa9d5bc785307 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 14:38:56 +0200 Subject: [PATCH 45/64] rename: RemoteFile.open_http --- python/kvikio/examples/http_io.py | 2 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 2 +- python/kvikio/kvikio/benchmarks/http_io.py | 2 +- python/kvikio/kvikio/remote_file.py | 6 +++--- python/kvikio/kvikio/utils.py | 4 ++-- python/kvikio/tests/test_http_io.py | 10 +++++----- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/kvikio/examples/http_io.py b/python/kvikio/examples/http_io.py index c74db75b99..26c9af1d44 100644 --- a/python/kvikio/examples/http_io.py +++ b/python/kvikio/examples/http_io.py @@ -19,7 +19,7 @@ def main(tmpdir: pathlib.Path): # Start a local server that serves files in `tmpdir` with LocalHttpServer(root_path=tmpdir) as server: # Open remote file from a http url - with kvikio.RemoteFile.from_http_url(f"{server.url}/myfile") as f: + with kvikio.RemoteFile.open_http(f"{server.url}/myfile") as f: # KvikIO fetch the file size assert f.nbytes() == a.nbytes # Read the remote file into `b` as if it was a local file. diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 4d0cd052f3..5e58da32f0 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -51,7 +51,7 @@ cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @classmethod - def from_url( + def open_http( cls, url: str, nbytes: Optional[int], diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py index 9a24503a7f..68d4643004 100644 --- a/python/kvikio/kvikio/benchmarks/http_io.py +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -26,7 +26,7 @@ def run_numpy_like(args, xp): def run() -> float: t0 = time.perf_counter() - with kvikio.RemoteFile.from_http_url(url, nbytes=src.nbytes) as f: + with kvikio.RemoteFile.open_http(url, nbytes=src.nbytes) as f: res = f.read(dst) t1 = time.perf_counter() assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index db8715c086..0b2e886f0b 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -40,7 +40,7 @@ def __init__(self, handle): """Create a remote file from a Cython handle. This constructor should not be called directly instead use a - factory method like `RemoteFile.from_http_url()` + factory method like `RemoteFile.open_http()` Parameters ---------- @@ -51,7 +51,7 @@ def __init__(self, handle): self._handle = handle @classmethod - def from_http_url( + def open_http( cls, url: str, nbytes: Optional[int] = None, @@ -66,7 +66,7 @@ def from_http_url( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.from_url(url, nbytes)) + return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) def __enter__(self) -> RemoteFile: return self diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py index c7ed39c7a4..09a9f2062a 100644 --- a/python/kvikio/kvikio/utils.py +++ b/python/kvikio/kvikio/utils.py @@ -47,7 +47,7 @@ def __init__( Example ------- >>> with LocalHttpServer(root_path="/my/server/") as server: - ... with kvikio.RemoteFile.from_http_url(f"{server.url}/myfile") as f: + ... with kvikio.RemoteFile.open_http(f"{server.url}/myfile") as f: ... f.read(...) Parameters @@ -55,7 +55,7 @@ def __init__( root_path Path to the directory the server will serve. range_support - Whether to support the ranges, required by `RemoteFile.from_http_url()`. + Whether to support the ranges, required by `RemoteFile.open_http()`. Depend on the `RangeHTTPServer` module (`pip install rangehttpserver`). max_lifetime Maximum lifetime of the server (in seconds). diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py index eaa1d4b0b8..70abec71b6 100644 --- a/python/kvikio/tests/test_http_io.py +++ b/python/kvikio/tests/test_http_io.py @@ -32,7 +32,7 @@ def http_server(request, tmpdir): def test_file_size(http_server, tmpdir): a = np.arange(100) a.tofile(tmpdir / "a") - with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes @@ -45,7 +45,7 @@ def test_read(http_server, tmpdir, xp, size, nthreads, tasksize): with kvikio.defaults.set_num_threads(nthreads): with kvikio.defaults.set_task_size(tasksize): - with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) assert f.read(b) == a.nbytes @@ -58,7 +58,7 @@ def test_large_read(http_server, tmpdir, xp, nthreads): a.tofile(tmpdir / "a") with kvikio.defaults.set_num_threads(nthreads): - with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes b = xp.empty_like(a) assert f.read(b) == a.nbytes @@ -69,7 +69,7 @@ def test_error_too_small_file(http_server, tmpdir, xp): a = xp.arange(10, dtype="uint8") b = xp.empty(100, dtype="uint8") a.tofile(tmpdir / "a") - with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes with pytest.raises( ValueError, match=r"cannot read 0\+100 bytes into a 10 bytes file" @@ -86,7 +86,7 @@ def test_no_range_support(http_server, tmpdir, xp): a = xp.arange(100, dtype="uint8") a.tofile(tmpdir / "a") b = xp.empty_like(a) - with kvikio.RemoteFile.from_http_url(f"{http_server}/a") as f: + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: assert f.nbytes() == a.nbytes with pytest.raises( OverflowError, match="maybe the server doesn't support file ranges?" From fcb935813f3dc4dec8e99bc4b236f03e184fa588 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 15:07:25 +0200 Subject: [PATCH 46/64] doc --- cpp/include/kvikio/remote_handle.hpp | 34 +++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 7edde788e3..602cb2a485 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -32,13 +32,26 @@ namespace kvikio { namespace detail { +/** + * @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks. + */ struct CallbackContext { - char* buf; - std::size_t size; - std::size_t offset; - bool overflow_error; + char* buf; // Output buffer to read into. + std::size_t size; // Total number of bytes to read. + std::size_t offset; // Offset into `buf` to start reading. + bool overflow_error; // Flag to indicate overflow. }; +/** + * @brief A "CURLOPT_WRITEFUNCTION" to copy downloaded data to the output host buffer. + * + * See . + * + * @param data Data downloaded by libcurl that is ready for consumption. + * @param size Size of each element in `nmemb`; size is always 1. + * @param nmemb Size of the data in `nmemb`. + * @param context A pointer to `CallbackContext`. + */ inline std::size_t callback_host_memory(char* data, std::size_t size, std::size_t nmemb, @@ -55,6 +68,16 @@ inline std::size_t callback_host_memory(char* data, return nbytes; } +/** + * @brief A "CURLOPT_WRITEFUNCTION" to copy downloaded data to the output device buffer. + * + * See . + * + * @param data Data downloaded by libcurl that is ready for consumption. + * @param size Size of each element in `nmemb`; size is always 1. + * @param nmemb Size of the data in `nmemb`. + * @param context A pointer to `CallbackContext`. + */ inline std::size_t callback_device_memory(char* data, std::size_t size, std::size_t nmemb, @@ -72,7 +95,7 @@ inline std::size_t callback_device_memory(char* data, CUstream stream = detail::StreamsByThread::get(); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); - // We have to sync since curl moght overwrite or free `data`. + // We have to sync since curl might overwrite or free `data`. CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); ctx->offset += nbytes; @@ -165,7 +188,6 @@ class RemoteHandle { throw std::runtime_error("cannot get size of " + endpoint->str() + ", content-length not provided by the server"); } - _nbytes = cl; _endpoint = std::move(endpoint); } From e8018d10aef9660c0dd0a2f85269dd0e970effe3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 15:11:00 +0200 Subject: [PATCH 47/64] cleanup --- cpp/include/kvikio/remote_handle.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 602cb2a485..1bcf983ed5 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -50,15 +50,15 @@ struct CallbackContext { * @param data Data downloaded by libcurl that is ready for consumption. * @param size Size of each element in `nmemb`; size is always 1. * @param nmemb Size of the data in `nmemb`. - * @param context A pointer to `CallbackContext`. + * @param context A pointer to an instance of `CallbackContext`. */ inline std::size_t callback_host_memory(char* data, std::size_t size, std::size_t nmemb, void* context) { - auto ctx = reinterpret_cast(context); - std::size_t nbytes = size * nmemb; + auto ctx = reinterpret_cast(context); + const std::size_t nbytes = size * nmemb; if (ctx->size < ctx->offset + nbytes) { ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; @@ -76,15 +76,15 @@ inline std::size_t callback_host_memory(char* data, * @param data Data downloaded by libcurl that is ready for consumption. * @param size Size of each element in `nmemb`; size is always 1. * @param nmemb Size of the data in `nmemb`. - * @param context A pointer to `CallbackContext`. + * @param context A pointer to an instance of `CallbackContext`. */ inline std::size_t callback_device_memory(char* data, std::size_t size, std::size_t nmemb, void* context) { - auto ctx = reinterpret_cast(context); - std::size_t nbytes = size * nmemb; + auto ctx = reinterpret_cast(context); + const std::size_t nbytes = size * nmemb; if (ctx->size < ctx->offset + nbytes) { ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; From bb737b96098557091480adecaf8863e8087d34cc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Oct 2024 15:24:10 +0200 Subject: [PATCH 48/64] PushAndPopContext outside callback_device_memory() --- cpp/include/kvikio/remote_handle.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 1bcf983ed5..436d07c91d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -90,8 +90,6 @@ inline std::size_t callback_device_memory(char* data, return CURL_WRITEFUNC_ERROR; } - CUcontext cuda_ctx = get_context_from_pointer(ctx->buf); - PushAndPopContext c(cuda_ctx); CUstream stream = detail::StreamsByThread::get(); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( convert_void2deviceptr(ctx->buf + ctx->offset), data, nbytes, stream)); @@ -225,15 +223,15 @@ class RemoteHandle { << " bytes file (" << _endpoint->str() << ")"; throw std::invalid_argument(ss.str()); } - - auto curl = create_curl_handle(); + const bool is_host_mem = is_host_memory(buf); + auto curl = create_curl_handle(); _endpoint->setopt(curl); std::string const byte_range = std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); curl.setopt(CURLOPT_RANGE, byte_range.c_str()); - if (is_host_memory(buf)) { + if (is_host_mem) { curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_host_memory); } else { curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_device_memory); @@ -243,7 +241,12 @@ class RemoteHandle { curl.setopt(CURLOPT_WRITEDATA, &ctx); try { - curl.perform(); + if (is_host_mem) { + curl.perform(); + } else { + PushAndPopContext c(get_context_from_pointer(buf)); + curl.perform(); + } } catch (std::runtime_error const& e) { if (ctx.overflow_error) { std::stringstream ss; From b8a9c0a92137db2c7f7e2f456ce11ea0e930a493 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 2 Oct 2024 14:16:10 +0200 Subject: [PATCH 49/64] more nvtx --- cpp/include/kvikio/remote_handle.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 436d07c91d..c518077f17 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -63,6 +63,7 @@ inline std::size_t callback_host_memory(char* data, ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; } + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle - callback_host_memory()", nbytes); std::memcpy(ctx->buf + ctx->offset, data, nbytes); ctx->offset += nbytes; return nbytes; @@ -89,6 +90,7 @@ inline std::size_t callback_device_memory(char* data, ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; } + KVIKIO_NVTX_FUNC_RANGE("RemoteHandle - callback_device_memory()", nbytes); CUstream stream = detail::StreamsByThread::get(); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync( From 9cffe1c8bdcb899965917947b21a69967ff7d470 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:10:55 +0200 Subject: [PATCH 50/64] Apply suggestions from code review Co-authored-by: Bradley Dice Co-authored-by: Lawrence Mitchell --- cpp/CMakeLists.txt | 2 +- cpp/include/kvikio/remote_handle.hpp | 8 ++++---- cpp/include/kvikio/shim/libcurl.hpp | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8ebc4e2120..e5afbc1d71 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -36,7 +36,7 @@ rapids_cmake_write_version_file(include/kvikio/version_config.hpp) rapids_cmake_build_type(Release) # build options -option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote io support" ON) +option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index c518077f17..d22297b310 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -58,7 +58,7 @@ inline std::size_t callback_host_memory(char* data, void* context) { auto ctx = reinterpret_cast(context); - const std::size_t nbytes = size * nmemb; + std::size_t const nbytes = size * nmemb; if (ctx->size < ctx->offset + nbytes) { ctx->overflow_error = true; return CURL_WRITEFUNC_ERROR; @@ -107,10 +107,10 @@ inline std::size_t callback_device_memory(char* data, /** * @brief Abstract base class for remote endpoints. * - * In this context, an endpoint refers to a remote file using a specify communication protocol. + * In this context, an endpoint refers to a remote file using a specific communication protocol. * * Each communication protocol, such as HTTP or S3, needs to implement this ABC and implement - * their own ctor that takes communication protocol specific arguments. + * its own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { public: @@ -159,7 +159,7 @@ class RemoteHandle { /** * @brief Create a new remote handle from an endpoint and a file size. * - * @param endpoint Remote endpoint used for subsequently IO. + * @param endpoint Remote endpoint used for subsequent IO. * @param nbytes The size of the remote file (in bytes). */ RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index cf885f914c..61b959bbb8 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -57,7 +57,7 @@ namespace kvikio { */ class LibCurl { public: - // We hold an unique pointer to the raw curl handle and sets `curl_easy_cleanup` as its Deleter. + // We hold a unique pointer to the raw curl handle and set `curl_easy_cleanup` as its Deleter. using UniqueHandlePtr = std::unique_ptr>; private: @@ -137,7 +137,7 @@ class LibCurl { * @brief Representation of a curl easy handle pointer and its operations. * * An instance is given a `LibCurl::UniqueHandlePtr` on creation, which is - * later retailed on destruction. + * later retained on destruction. */ class CurlHandle { private: @@ -152,7 +152,7 @@ class CurlHandle { * * Typically, do not use this directly instead use the `create_curl_handle()` macro. * - * @param handle An unused curl easy handle pointer, which is retailed on destruction. + * @param handle An unused curl easy handle pointer, which is retained on destruction. * @param source_file Path of source file of the caller (for error messages). * @param source_line Line of source file of the caller (for error messages). */ From 1257a9fe56180eec8e8b664f6f63c9efe1155193 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:14:25 +0200 Subject: [PATCH 51/64] std::function --- cpp/include/kvikio/shim/libcurl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index 61b959bbb8..a8c5390f4e 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -58,7 +58,7 @@ namespace kvikio { class LibCurl { public: // We hold a unique pointer to the raw curl handle and set `curl_easy_cleanup` as its Deleter. - using UniqueHandlePtr = std::unique_ptr>; + using UniqueHandlePtr = std::unique_ptr>; private: std::mutex _mutex{}; From 1006b241ca8ba6d69372cc213fab40505bc31ad4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:23:56 +0200 Subject: [PATCH 52/64] getinfo: take output pointer --- cpp/include/kvikio/shim/libcurl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index a8c5390f4e..cee50c5947 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -237,7 +237,7 @@ class CurlHandle { * @param output The output, which is used as-is: `curl_easy_getinfo(..., output)`. */ template - void getinfo(CURLINFO info, OUTPUT output) + void getinfo(CURLINFO info, OUTPUT* output) { CURLcode err = curl_easy_getinfo(handle(), info, output); if (err != CURLE_OK) { From 9ba9752916bf2cfd962b715494d10876c3d5b799 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:25:16 +0200 Subject: [PATCH 53/64] STATUS --- python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 29f72c1074..9f2118bb61 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -22,7 +22,7 @@ if(TARGET CURL::libcurl) list(APPEND cython_modules remote_handle.pyx) else() message( - WARNING + STATUS "Skipping remote_handle.pyx (please set KvikIO_REMOTE_SUPPORT=ON for remote file support)" ) endif() From 128e3ddf62b2ae7993a4dec2c1d35febdc665ace Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:28:25 +0200 Subject: [PATCH 54/64] close --- python/kvikio/kvikio/remote_file.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 0b2e886f0b..52bbe8010f 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -68,11 +68,15 @@ def open_http( """ return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) + def close(self) -> None: + """Close the file""" + pass + def __enter__(self) -> RemoteFile: return self def __exit__(self, exc_type, exc_val, exc_tb) -> None: - pass + self.close() def nbytes(self) -> int: """Get the file size. From 0df172677834ffecd5302e795d0fb05eff271a66 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:33:27 +0200 Subject: [PATCH 55/64] test_http_io: check is_remote_file_available --- python/kvikio/tests/test_examples.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index 493b2253d7..07be1fc156 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -7,6 +7,8 @@ import pytest +import kvikio + examples_path = Path(os.path.realpath(__file__)).parent / ".." / "examples" @@ -31,5 +33,11 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): def test_http_io(tmp_path, monkeypatch): """Test examples/http_io.py""" + if not kvikio.is_remote_file_available(): + pytest.skip( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ) + monkeypatch.syspath_prepend(str(examples_path)) import_module("http_io").main(tmp_path) From 96af5ac6b479a2d8a21e95aeb8cd4018450a6702 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 09:37:54 +0200 Subject: [PATCH 56/64] std::ptrdiff_t --- cpp/include/kvikio/remote_handle.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index d22297b310..886e93d30a 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -36,10 +36,10 @@ namespace detail { * @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks. */ struct CallbackContext { - char* buf; // Output buffer to read into. - std::size_t size; // Total number of bytes to read. - std::size_t offset; // Offset into `buf` to start reading. - bool overflow_error; // Flag to indicate overflow. + char* buf; // Output buffer to read into. + std::size_t size; // Total number of bytes to read. + std::ptrdiff_t offset; // Offset into `buf` to start reading. + bool overflow_error; // Flag to indicate overflow. }; /** From feccdd9a76d3c1175064bb1046f66ff41b888eb7 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:08:28 +0200 Subject: [PATCH 57/64] Apply suggestions from code review Co-authored-by: Bradley Dice --- dependencies.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index b13ade8417..1ac4922640 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -344,13 +344,9 @@ dependencies: - &dask dask>=2022.05.2 - pytest - pytest-cov - - output_types: [requirements, pyproject] + - output_types: [conda, requirements, pyproject] packages: - rangehttpserver - - output_types: conda - packages: - - pip: - - rangehttpserver specific: - output_types: [conda, requirements, pyproject] matrices: From 7dcc8f3f02b4f214411f2265a4314e099b2066f5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:11:27 +0200 Subject: [PATCH 58/64] use conda's rangehttpserver --- conda/environments/all_cuda-118_arch-aarch64.yaml | 3 +-- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 +-- conda/environments/all_cuda-125_arch-aarch64.yaml | 3 +-- conda/environments/all_cuda-125_arch-x86_64.yaml | 3 +-- dependencies.yaml | 2 -- 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index cee9bc495a..0e7f4b3e21 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -29,6 +29,7 @@ dependencies: - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core>=0.10.0 - sphinx @@ -36,6 +37,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-aarch64=2.17 - zarr -- pip: - - rangehttpserver name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6aa0a40289..293085e8f7 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,6 +31,7 @@ dependencies: - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core>=0.10.0 - sphinx @@ -38,6 +39,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-64=2.17 - zarr -- pip: - - rangehttpserver name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 321431dbed..1e4a370ff6 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -29,6 +29,7 @@ dependencies: - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core>=0.10.0 - sphinx @@ -36,6 +37,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-aarch64=2.17 - zarr -- pip: - - rangehttpserver name: all_cuda-125_arch-aarch64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5123bc012d..44d8772a71 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -29,6 +29,7 @@ dependencies: - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 - scikit-build-core>=0.10.0 - sphinx @@ -36,6 +37,4 @@ dependencies: - sphinx_rtd_theme - sysroot_linux-64=2.17 - zarr -- pip: - - rangehttpserver name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 1ac4922640..39ba3aaa17 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -344,8 +344,6 @@ dependencies: - &dask dask>=2022.05.2 - pytest - pytest-cov - - output_types: [conda, requirements, pyproject] - packages: - rangehttpserver specific: - output_types: [conda, requirements, pyproject] From 60f73310f84fcb55ff525a371291637c14054ed9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:14:59 +0200 Subject: [PATCH 59/64] doc --- cpp/include/kvikio/remote_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 886e93d30a..026eff77e8 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -270,7 +270,7 @@ class RemoteHandle { * @param size Number of bytes to read. * @param file_offset File offset in bytes. * @param task_size Size of each task in bytes. - * @return Number of bytes read, which is `size` always. + * @return Future that on completion returns the size of bytes read, which is always `size`. */ std::future pread(void* buf, std::size_t size, From 13e0b7a10a629e1eb723ee49bd449a305896aeeb Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:29:40 +0200 Subject: [PATCH 60/64] CallbackContext ctor --- cpp/include/kvikio/remote_handle.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 026eff77e8..e036ebcb37 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -40,6 +40,10 @@ struct CallbackContext { std::size_t size; // Total number of bytes to read. std::ptrdiff_t offset; // Offset into `buf` to start reading. bool overflow_error; // Flag to indicate overflow. + CallbackContext(void* buf, std::size_t size) + : buf{static_cast(buf)}, size{size}, offset{0}, overflow_error{0} + { + } }; /** @@ -238,8 +242,7 @@ class RemoteHandle { } else { curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_device_memory); } - detail::CallbackContext ctx{ - .buf = reinterpret_cast(buf), .size = size, .offset = 0, .overflow_error = false}; + detail::CallbackContext ctx{buf, size}; curl.setopt(CURLOPT_WRITEDATA, &ctx); try { From 474e694613ce4922cf6beb05ab0e44964c8b7fcc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:49:25 +0200 Subject: [PATCH 61/64] host: libcurl==7.87.0 --- conda/recipes/libkvikio/meta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 233bbd4727..7c2327f258 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -52,7 +52,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} - - libcurl>=7.87.0 + - libcurl==7.87.0 outputs: - name: libkvikio @@ -75,6 +75,7 @@ outputs: - cmake {{ cmake_version }} host: - cuda-version ={{ cuda_version }} + - libcurl==7.87.0 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} From 9022b18ec65c17b3cf900dcae03e6fe05d6c4c24 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:51:18 +0200 Subject: [PATCH 62/64] run: remove libcurl>=7.87.0 --- conda/recipes/libkvikio/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 7c2327f258..999b9fc2c1 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -85,7 +85,6 @@ outputs: {% else %} - libcufile-dev # [linux] {% endif %} - - libcurl>=7.87.0 test: commands: - test -f $PREFIX/include/kvikio/file_handle.hpp From 8bbb33be5eb697dabe0f5618f43a592cad4e6fdc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 12:54:52 +0200 Subject: [PATCH 63/64] curl-7_87_0 --- cpp/cmake/thirdparty/get_libcurl.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_libcurl.cmake b/cpp/cmake/thirdparty/get_libcurl.cmake index 09b637d1cb..7695592737 100644 --- a/cpp/cmake/thirdparty/get_libcurl.cmake +++ b/cpp/cmake/thirdparty/get_libcurl.cmake @@ -23,7 +23,7 @@ function(find_and_configure_libcurl) INSTALL_EXPORT_SET kvikio-exports CPM_ARGS GIT_REPOSITORY https://github.com/curl/curl - GIT_TAG curl-8_10_1 # let's get a newer release if we have to build it ourselves anyways. + GIT_TAG curl-7_87_0 OPTIONS "BUILD_CURL_EXE OFF" "BUILD_SHARED_LIBS OFF" "BUILD_TESTING OFF" "CURL_USE_LIBPSL OFF" "CURL_DISABLE_LDAP ON" "CMAKE_POSITION_INDEPENDENT_CODE ON" ) From 1176b67282e8204dd22bf03df61d4b89b799ca14 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Oct 2024 19:10:52 +0200 Subject: [PATCH 64/64] kvikio/meta.yaml: libcurl==7.87.0 --- conda/recipes/kvikio/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml index 4a352012e3..3c41af3310 100644 --- a/conda/recipes/kvikio/meta.yaml +++ b/conda/recipes/kvikio/meta.yaml @@ -64,6 +64,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - libkvikio ={{ version }} + - libcurl==7.87.0 run: - python - numpy >=1.23,<3.0a0