From 4c5ec7df514a67ffd5d222b57c4778b6fffe79e5 Mon Sep 17 00:00:00 2001 From: Hussain Naeem <47661256+Hoosayin@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:06:09 +0500 Subject: [PATCH] Improve writing large arrays (#18) Merged FesapiHdfProxy::createSubArrayNd() in FesapiHdfProxy::writeSubArrayNd() --- src/etp/fesapi/FesapiHdfProxy.cpp | 135 +++++++++++++----------------- src/etp/fesapi/FesapiHdfProxy.h | 10 +-- 2 files changed, 61 insertions(+), 84 deletions(-) diff --git a/src/etp/fesapi/FesapiHdfProxy.cpp b/src/etp/fesapi/FesapiHdfProxy.cpp index 4ffc9d4..bb405e7 100644 --- a/src/etp/fesapi/FesapiHdfProxy.cpp +++ b/src/etp/fesapi/FesapiHdfProxy.cpp @@ -115,68 +115,6 @@ std::vector FesapiHdfProxy::getElementCountPerDimension(const std::str return result; } -template -void FesapiHdfProxy::writeSubArrayNd( - const std::string& uri, - const std::string& pathInResource, - std::vector& totalCounts, - std::vector starts, - std::vector counts, - const void* values) -{ - // Calculate array size - size_t totalCount{ 1 }; - - for (const auto& count : counts) { - totalCount *= count; - } - - // [Base Condition] Array size is OK to be transmitted. - if ((totalCount * sizeof(T)) <= maxArraySize_) { - - // PUT DATA SUBARRAYS - Energistics::Etp::v12::Protocol::DataArray::PutDataSubarrays pdsa{}; - pdsa.dataSubarrays["0"].uid.uri = uri; - pdsa.dataSubarrays["0"].uid.pathInResource = pathInResource; - pdsa.dataSubarrays["0"].starts = starts; - pdsa.dataSubarrays["0"].counts = counts; - - // Cast values in T values. - const T* typeValues{ static_cast(values) }; - - // Create 1D Array for Sub Values. - T* subValues = new T[totalCount]; - size_t valueIndex{ 0 }; - - // Recursively populate subValues starting from first dimension. - populateSubValuesNd( - 0, - totalCounts, starts, counts, - valueIndex, typeValues, subValues); - - // Create AVRO Array - Energistics::Etp::v12::Datatypes::AnyArray data; - createAnyArray(data, totalCount, subValues); // Type-specific code is written in explicit specializations for createAnyArray(). - pdsa.dataSubarrays["0"].data = data; - - std::cout << "Writing subarray..." << std::endl; - - // Send putDataSubarrays Message - session_->sendAndBlock(pdsa, 0, 0x02); - - // Delete Array - delete[] subValues; - } - // [Divide and Conquer Approach] If sub array is still large, partition it into more sub arrays. - else { - // Recursively divide all dimensions starting from first dimension. - createSubArrayNd( - 0, - uri, pathInResource, totalCounts, - starts, counts, values); - } -} - template void FesapiHdfProxy::populateSubValuesNd( size_t dimensionIndex, @@ -245,7 +183,7 @@ int64_t FesapiHdfProxy::getCountsProduct( } template -void FesapiHdfProxy::createSubArrayNd( +void FesapiHdfProxy::writeSubArrayNd( size_t dimensionIndex, const std::string& uri, const std::string& pathInResource, @@ -254,15 +192,56 @@ void FesapiHdfProxy::createSubArrayNd( std::vector counts, const void* values) { - // [Base Condition] If dimensionIndex exceeds the last dimension. - if (dimensionIndex >= starts.size()) { - // Recursively Write Subarray. + // Calculate array size + size_t totalCount{ 1 }; + + for (const auto& count : counts) { + totalCount *= count; + } + + // [Base Condition] If subarray can be transmitted. + if ((totalCount * sizeof(T)) <= maxArraySize_) { + // PUT DATA SUBARRAYS + Energistics::Etp::v12::Protocol::DataArray::PutDataSubarrays pdsa{}; + pdsa.dataSubarrays["0"].uid.uri = uri; + pdsa.dataSubarrays["0"].uid.pathInResource = pathInResource; + pdsa.dataSubarrays["0"].starts = starts; + pdsa.dataSubarrays["0"].counts = counts; + + // Cast values in T values. + const T* typeValues{ static_cast(values) }; + + // Create 1D Array for Sub Values. + T* subValues = new T[totalCount]; + size_t valueIndex{ 0 }; + + // Recursively populate subValues starting from first dimension. + populateSubValuesNd( + 0, + totalCounts, starts, counts, + valueIndex, typeValues, subValues); + + // Create AVRO Array + Energistics::Etp::v12::Datatypes::AnyArray data; + createAnyArray(data, totalCount, subValues); // Type-specific code is written in explicit specializations for createAnyArray(). + pdsa.dataSubarrays["0"].data = data; + + std::cout << "Writing subarray..." << std::endl; + + // Send putDataSubarrays Message + session_->sendAndBlock(pdsa, 0, 0x02); + + // Delete Array + delete[] subValues; + } + // Again divide all dimensions starting from first dimension. + else if (dimensionIndex >= starts.size()) { writeSubArrayNd( + 0, uri, pathInResource, totalCounts, - starts, - counts, - values); + starts, counts, values); } + // Divide the values of current dimension in halves. else { int64_t numberOfValues = counts[dimensionIndex]; @@ -273,7 +252,7 @@ void FesapiHdfProxy::createSubArrayNd( newCounts[dimensionIndex] = firstHalfValues; // Recursively divide next dimension. - createSubArrayNd( + writeSubArrayNd( dimensionIndex + 1, uri, pathInResource, totalCounts, starts, @@ -285,7 +264,7 @@ void FesapiHdfProxy::createSubArrayNd( newCounts[dimensionIndex] = secondHalfValues; // Recursively divide next dimension. - createSubArrayNd( + writeSubArrayNd( dimensionIndex + 1, uri, pathInResource, totalCounts, newStarts, @@ -543,43 +522,41 @@ void FesapiHdfProxy::writeArrayNd(const std::string & groupName, // Recursively Write Subarrays if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::DOUBLE) { - writeSubArrayNd(uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); } else if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::FLOAT) { - writeSubArrayNd(uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); } else if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::INT64 || datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::UINT64) { - writeSubArrayNd(uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); } else if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::INT32 || datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::UINT32) { - writeSubArrayNd(uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); } else if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::INT16 || datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::UINT16) { - writeSubArrayNd( - uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); } else if (datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::INT8 || datatype == COMMON_NS::AbstractObject::numericalDatatypeEnum::UINT8) { - writeSubArrayNd( - uri, pathInResource, counts, + writeSubArrayNd(0, uri, pathInResource, counts, starts, counts, values); diff --git a/src/etp/fesapi/FesapiHdfProxy.h b/src/etp/fesapi/FesapiHdfProxy.h index 774a6ef..ab23825 100644 --- a/src/etp/fesapi/FesapiHdfProxy.h +++ b/src/etp/fesapi/FesapiHdfProxy.h @@ -139,14 +139,14 @@ namespace ETP_NS * @param counts The number of values in each dimension of the subarray to be written. * @param values 1d array of specific datatype ordered firstly by fastest direction. */ - template + /*template void writeSubArrayNd( const std::string& uri, const std::string& pathInResource, std::vector& totalCounts, std::vector starts, std::vector counts, - const void* values); + const void* values);*/ /** * Recursively populate subValues array from original values array. @@ -201,7 +201,7 @@ namespace ETP_NS std::vector& totalCounts); /** - * Recursively divide each dimension into half and create a new nD subarray. + * Recursively write sub arrays (potentially with 2 dimensions) of a specific datatype into the HDF file by means of a single dataset. * @param dimensionIndex The index of dimension in nD array. * @param uri The uri of the original array. * @param pathInResource The path of the original array. @@ -211,7 +211,7 @@ namespace ETP_NS * @param values 1d array of specific datatype ordered firstly by fastest direction. */ template - void createSubArrayNd( + void writeSubArrayNd( size_t dimensionIndex, const std::string& uri, const std::string& pathInResource, @@ -575,7 +575,7 @@ namespace ETP_NS AbstractSession* session_; unsigned int compressionLevel; std::string xmlNs_; - int maxArraySize_{ 4000000 }; // Bytes + int maxArraySize_{ 12000000 }; // Bytes Energistics::Etp::v12::Datatypes::DataArrayTypes::DataArrayIdentifier buildDataArrayIdentifier(const std::string & datasetName) const; Energistics::Etp::v12::Protocol::DataArray::GetDataArrays buildGetDataArraysMessage(const std::string & datasetName) const;