Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Order dictionary indicies to prevent 2x-50x size regression #94

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dwio/nimble/common/Vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ class Vector {
std::copy(first, last, dataRawPtr_);
}

template <typename It>
Vector(velox::memory::MemoryPool* memoryPool, size_t size, It first, It last)
: memoryPool_{memoryPool} {
init(size);
std::copy(first, last, dataRawPtr_);
}

Vector(const Vector& other) {
*this = other;
}
Expand Down
10 changes: 10 additions & 0 deletions dwio/nimble/common/tests/VectorTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ TEST_F(VectorTests, FromRange) {
EXPECT_EQ(6, v1[2]);
}

TEST_F(VectorTests, FromRangeWithSize) {
std::vector<int32_t> source{4, 5, 6};
nimble::Vector<int32_t> v1(
pool_.get(), source.size(), source.begin(), source.end());
EXPECT_EQ(3, v1.size());
EXPECT_EQ(4, v1[0]);
EXPECT_EQ(5, v1[1]);
EXPECT_EQ(6, v1[2]);
}

TEST_F(VectorTests, EqualOp1) {
nimble::Vector<int32_t> v1(pool_.get());
v1.push_back(1);
Expand Down
47 changes: 40 additions & 7 deletions dwio/nimble/encodings/DictionaryEncoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@
#include <span>
#include "dwio/nimble/common/Buffer.h"
#include "dwio/nimble/common/EncodingPrimitives.h"
#include "dwio/nimble/common/EncodingType.h"
#include "dwio/nimble/common/Types.h"
#include "dwio/nimble/common/Vector.h"
#include "dwio/nimble/encodings/Encoding.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
#include "dwio/nimble/encodings/EncodingIdentifier.h"
#include "dwio/nimble/encodings/EncodingSelection.h"
#include "dwio/nimble/encodings/TrivialEncoding.h"
#include "folly/container/F14Map.h"
#include "velox/common/memory/Memory.h"

// A dictionary encoded stream is comprised of two pieces: a mapping from the
// n unique values in a stream to the integers [0, n) and the vector of indices
Expand Down Expand Up @@ -205,10 +202,46 @@ std::string_view DictionaryEncoding<T>::encode(
alphabetMapping.reserve(alphabetCount);
Vector<physicalType> alphabet{&buffer.getMemoryPool()};
alphabet.reserve(alphabetCount);
uint32_t index = 0;
for (const auto& pair : selection.statistics().uniqueCounts()) {
alphabet.push_back(pair.first);
alphabetMapping.emplace(pair.first, index++);

/// Indicies are usually stored with VARINT encoding which depends on the
/// number of set bits in the value.
///
/// 127 (1 << 7) is the maximum number that can be stored in one byte with
/// VARINT encoding. Meaning that if the alphabet has less than 127 unique
/// values, then all of them can be stored as one byte per index value.
///
/// If the alphabet has more than 127 unique values, then we need to put more
/// frequent alphabet values at the beginning of the alphabet to reduce the
/// number of bytes needed to store the indices encoded as VARINT.
///
/// This sorting optimization gives 3-5x size reduction if you compare most
/// and least optimal order of indicies when they use VARINT encoding.
if (alphabetCount > (1 << 7)) {
const auto& uniqueCounts = selection.statistics().uniqueCounts();
Vector<std::pair<physicalType, uint64_t>> sortedAlphabet(
&buffer.getMemoryPool(),
uniqueCounts.size(),
uniqueCounts.cbegin(),
uniqueCounts.cend());
sort(
sortedAlphabet.begin(),
sortedAlphabet.end(),
[](const std::pair<physicalType, uint32_t>& a,
const std::pair<physicalType, uint32_t>& b) {
return a.second < b.second;
});

uint32_t index = 0;
for (const auto& pair : sortedAlphabet) {
alphabet.push_back(pair.first);
alphabetMapping.emplace(pair.first, index++);
}
} else {
uint32_t index = 0;
for (const auto& pair : selection.statistics().uniqueCounts()) {
alphabet.push_back(pair.first);
alphabetMapping.emplace(pair.first, index++);
}
}

Vector<uint32_t> indices{&buffer.getMemoryPool()};
Expand Down
6 changes: 6 additions & 0 deletions dwio/nimble/encodings/Statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ namespace facebook::nimble {
template <typename T, typename InputType = T>
class UniqueValueCounts {
public:
// NOTICE: absl::flat_hash_map has incosistent iteration order
// every single time, this might be a problem for some encodings that depend
// on the order of the elements, such as dictionary indicies encoded with
// VARINT encoding. This also means that the order of the elements in the
// dictionary alphabet will be different every time giving incosistent file
// sizes.
using MapType = absl::flat_hash_map<T, uint64_t>;

struct Iterator {
Expand Down