Skip to content

Commit

Permalink
GH-2988: Supports disabling statistics for specific columns (#2989)
Browse files Browse the repository at this point in the history
  • Loading branch information
ConeyLiu authored Aug 28, 2024
1 parent 3ac860e commit 3b5fb4b
Show file tree
Hide file tree
Showing 8 changed files with 353 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public class ParquetProperties {
public static final double DEFAULT_BLOOM_FILTER_FPP = 0.01;
public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false;
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
public static final boolean DEFAULT_STATISTICS_ENABLED = true;

public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;

Expand Down Expand Up @@ -122,6 +123,7 @@ public static WriterVersion fromString(String name) {
private final boolean pageWriteChecksumEnabled;
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
private final Map<String, String> extraMetaData;
private final ColumnProperty<Boolean> statistics;

private ParquetProperties(Builder builder) {
this.pageSizeThreshold = builder.pageSize;
Expand Down Expand Up @@ -149,6 +151,7 @@ private ParquetProperties(Builder builder) {
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
this.extraMetaData = builder.extraMetaData;
this.statistics = builder.statistics.build();
}

public static Builder builder() {
Expand Down Expand Up @@ -330,6 +333,10 @@ public Map<String, String> getExtraMetaData() {
return extraMetaData;
}

public boolean getStatisticsEnabled(ColumnDescriptor column) {
return statistics.getValue(column);
}

@Override
public String toString() {
return "Parquet page size to " + getPageSizeThreshold() + '\n'
Expand Down Expand Up @@ -372,6 +379,7 @@ public static class Builder {
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
private Map<String, String> extraMetaData = new HashMap<>();
private final ColumnProperty.Builder<Boolean> statistics;

private Builder() {
enableDict = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED);
Expand All @@ -387,6 +395,7 @@ private Builder() {
ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED);
numBloomFilterCandidates =
ColumnProperty.<Integer>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER);
statistics = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_STATISTICS_ENABLED);
}

private Builder(ParquetProperties toCopy) {
Expand All @@ -409,6 +418,7 @@ private Builder(ParquetProperties toCopy) {
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
this.extraMetaData = toCopy.extraMetaData;
this.statistics = ColumnProperty.builder(toCopy.statistics);
}

/**
Expand Down Expand Up @@ -657,6 +667,18 @@ public Builder withExtraMetaData(Map<String, String> extraMetaData) {
return this;
}

/**
* Enable or disable the statistics for given column. All column statistics are enabled by default.
*
* @param columnPath the given column
* @param enabled enable or disable
* @return this builder for method chaining
*/
public Builder withStatisticsEnabled(String columnPath, boolean enabled) {
this.statistics.withValue(columnPath, enabled);
return this;
}

public ParquetProperties build() {
ParquetProperties properties = new ParquetProperties(this);
// we pass a constructed but uninitialized factory to ParquetProperties above as currently
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,23 @@
class ColumnValueCollector {

private final ColumnDescriptor path;
private final boolean statisticsEnabled;
private BloomFilterWriter bloomFilterWriter;
private BloomFilter bloomFilter;
private Statistics<?> statistics;
private SizeStatistics.Builder sizeStatisticsBuilder;

ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
this.path = path;
this.statisticsEnabled = props.getStatisticsEnabled(path);
resetPageStatistics();
initBloomFilter(bloomFilterWriter, props);
}

void resetPageStatistics() {
this.statistics = Statistics.createStats(path.getPrimitiveType());
this.statistics = statisticsEnabled
? Statistics.createStats(path.getPrimitiveType())
: Statistics.noopStats(path.getPrimitiveType());
this.sizeStatisticsBuilder = SizeStatistics.newBuilder(
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.column.statistics;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType;

/**
* A noop statistics which always return empty.
*/
class NoopStatistics<T extends Comparable<T>> extends Statistics<T> {

NoopStatistics(PrimitiveType type) {
super(type);
}

@Override
public void updateStats(int value) {}

@Override
public void updateStats(long value) {}

@Override
public void updateStats(float value) {}

@Override
public void updateStats(double value) {}

@Override
public void updateStats(boolean value) {}

@Override
public void updateStats(Binary value) {}

@Override
public boolean equals(Object other) {
if (other == this) return true;
if (!(other instanceof Statistics)) return false;
Statistics stats = (Statistics) other;
return type().equals(stats.type());
}

@Override
public int hashCode() {
return 31 * type().hashCode();
}

@Override
protected void mergeStatisticsMinMax(Statistics stats) {}

@Override
public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {}

@Override
public T genericGetMin() {
throw new UnsupportedOperationException(
"genericGetMin is not supported by " + getClass().getName());
}

@Override
public T genericGetMax() {
throw new UnsupportedOperationException(
"genericGetMax is not supported by " + getClass().getName());
}

@Override
public byte[] getMaxBytes() {
throw new UnsupportedOperationException(
"getMaxBytes is not supported by " + getClass().getName());
}

@Override
public byte[] getMinBytes() {
throw new UnsupportedOperationException(
"getMinBytes is not supported by " + getClass().getName());
}

@Override
String stringify(T value) {
throw new UnsupportedOperationException(
"stringify is not supported by " + getClass().getName());
}

@Override
public boolean isSmallerThan(long size) {
throw new UnsupportedOperationException(
"isSmallerThan is not supported by " + getClass().getName());
}

@Override
public long getNumNulls() {
return -1;
}

@Override
public boolean isEmpty() {
return true;
}

@Override
public boolean hasNonNullValue() {
return false;
}

@Override
public boolean isNumNullsSet() {
return false;
}

@Override
public Statistics<T> copy() {
return new NoopStatistics<>(this.type());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,15 @@ public static Statistics<?> createStats(Type type) {
}
}

/**
* Creates a noop {@code NoopStatistics} statistics instance. This is only used when the user disables statistics for the specified column.
* @param type type of the column
* @return a noop statistics
*/
public static Statistics<?> noopStats(Type type) {
return new NoopStatistics<>(type.asPrimitiveType());
}

/**
* Returns a builder to create new statistics object. Used to read the statistics from the parquet file.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

import java.nio.ByteBuffer;
Expand Down Expand Up @@ -900,4 +902,29 @@ public void testSpecBuilderForDouble() {
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
}

@Test
public void testNoopStatistics() {
// Test basic max/min
integerArray = new int[] {1, 3, 14, 54, 66, 8, 0, 23, 54};
Statistics<?> stats = Statistics.noopStats(new PrimitiveType(REQUIRED, INT32, "int32"));
assertTrue(stats.isEmpty());

for (int i : integerArray) {
stats.updateStats(i);
}

assertEquals(stats.getNumNulls(), -1);
assertFalse(stats.hasNonNullValue());
assertFalse(stats.isNumNullsSet());
assertTrue(stats.isEmpty());

assertThrows(UnsupportedOperationException.class, stats::genericGetMax);
assertThrows(UnsupportedOperationException.class, stats::genericGetMin);
assertThrows(UnsupportedOperationException.class, stats::getMaxBytes);
assertThrows(UnsupportedOperationException.class, stats::getMinBytes);
assertThrows(UnsupportedOperationException.class, stats::maxAsString);
assertThrows(UnsupportedOperationException.class, stats::minAsString);
assertThrows(UnsupportedOperationException.class, () -> stats.isSmallerThan(0));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,18 @@ public SELF config(String property, String value) {
return self();
}

/**
* Sets the statistics enabled/disabled for the specified column. All column statistics are enabled by default.
*
* @param columnPath the path of the column (dot-string)
* @param enabled whether to write calculate statistics for the column
* @return this builder for method chaining
*/
public SELF withStatisticsEnabled(String columnPath, boolean enabled) {
encodingPropsBuilder.withStatisticsEnabled(columnPath, enabled);
return self();
}

/**
* Build a {@link ParquetWriter} with the accumulated configuration.
*
Expand Down
Loading

0 comments on commit 3b5fb4b

Please sign in to comment.