diff --git a/dwio/nimble/common/Vector.h b/dwio/nimble/common/Vector.h index 3694bde..47bc9e4 100644 --- a/dwio/nimble/common/Vector.h +++ b/dwio/nimble/common/Vector.h @@ -64,6 +64,13 @@ class Vector { std::copy(first, last, dataRawPtr_); } + template + Vector(velox::memory::MemoryPool* memoryPool, size_t size, It first, It last) + : memoryPool_{memoryPool} { + init(size); + std::copy(first, last, dataRawPtr_); + } + Vector(const Vector& other) { *this = other; } diff --git a/dwio/nimble/common/tests/VectorTests.cpp b/dwio/nimble/common/tests/VectorTests.cpp index 75fe94d..de697d9 100644 --- a/dwio/nimble/common/tests/VectorTests.cpp +++ b/dwio/nimble/common/tests/VectorTests.cpp @@ -50,6 +50,16 @@ TEST_F(VectorTests, FromRange) { EXPECT_EQ(6, v1[2]); } +TEST_F(VectorTests, FromRangeWithSize) { + std::vector source{4, 5, 6}; + nimble::Vector v1( + pool_.get(), source.size(), source.begin(), source.end()); + EXPECT_EQ(3, v1.size()); + EXPECT_EQ(4, v1[0]); + EXPECT_EQ(5, v1[1]); + EXPECT_EQ(6, v1[2]); +} + TEST_F(VectorTests, EqualOp1) { nimble::Vector v1(pool_.get()); v1.push_back(1); diff --git a/dwio/nimble/encodings/DictionaryEncoding.h b/dwio/nimble/encodings/DictionaryEncoding.h index 42568d7..1b62d2c 100644 --- a/dwio/nimble/encodings/DictionaryEncoding.h +++ b/dwio/nimble/encodings/DictionaryEncoding.h @@ -18,16 +18,13 @@ #include #include "dwio/nimble/common/Buffer.h" #include "dwio/nimble/common/EncodingPrimitives.h" -#include "dwio/nimble/common/EncodingType.h" #include "dwio/nimble/common/Types.h" #include "dwio/nimble/common/Vector.h" #include "dwio/nimble/encodings/Encoding.h" #include "dwio/nimble/encodings/EncodingFactory.h" #include "dwio/nimble/encodings/EncodingIdentifier.h" #include "dwio/nimble/encodings/EncodingSelection.h" -#include "dwio/nimble/encodings/TrivialEncoding.h" #include "folly/container/F14Map.h" -#include "velox/common/memory/Memory.h" // A dictionary encoded stream is comprised of two pieces: a mapping from the // n unique values in a stream to the integers [0, n) and the vector of indices @@ -205,10 +202,46 @@ std::string_view DictionaryEncoding::encode( alphabetMapping.reserve(alphabetCount); Vector alphabet{&buffer.getMemoryPool()}; alphabet.reserve(alphabetCount); - uint32_t index = 0; - for (const auto& pair : selection.statistics().uniqueCounts()) { - alphabet.push_back(pair.first); - alphabetMapping.emplace(pair.first, index++); + + /// Indicies are usually stored with VARINT encoding which depends on the + /// number of set bits in the value. + /// + /// 127 (1 << 7) is the maximum number that can be stored in one byte with + /// VARINT encoding. Meaning that if the alphabet has less than 127 unique + /// values, then all of them can be stored as one byte per index value. + /// + /// If the alphabet has more than 127 unique values, then we need to put more + /// frequent alphabet values at the beginning of the alphabet to reduce the + /// number of bytes needed to store the indices encoded as VARINT. + /// + /// This sorting optimization gives 3-5x size reduction if you compare most + /// and least optimal order of indicies when they use VARINT encoding. + if (alphabetCount > (1 << 7)) { + const auto& uniqueCounts = selection.statistics().uniqueCounts(); + Vector> sortedAlphabet( + &buffer.getMemoryPool(), + uniqueCounts.size(), + uniqueCounts.cbegin(), + uniqueCounts.cend()); + sort( + sortedAlphabet.begin(), + sortedAlphabet.end(), + [](const std::pair& a, + const std::pair& b) { + return a.second < b.second; + }); + + uint32_t index = 0; + for (const auto& pair : sortedAlphabet) { + alphabet.push_back(pair.first); + alphabetMapping.emplace(pair.first, index++); + } + } else { + uint32_t index = 0; + for (const auto& pair : selection.statistics().uniqueCounts()) { + alphabet.push_back(pair.first); + alphabetMapping.emplace(pair.first, index++); + } } Vector indices{&buffer.getMemoryPool()}; diff --git a/dwio/nimble/encodings/Statistics.h b/dwio/nimble/encodings/Statistics.h index a4009a0..37d990b 100644 --- a/dwio/nimble/encodings/Statistics.h +++ b/dwio/nimble/encodings/Statistics.h @@ -30,6 +30,12 @@ namespace facebook::nimble { template class UniqueValueCounts { public: + // NOTICE: absl::flat_hash_map has incosistent iteration order + // every single time, this might be a problem for some encodings that depend + // on the order of the elements, such as dictionary indicies encoded with + // VARINT encoding. This also means that the order of the elements in the + // dictionary alphabet will be different every time giving incosistent file + // sizes. using MapType = absl::flat_hash_map; struct Iterator {