private static OptionalLong getDistinctValuesCount(HiveColumnStatistics statistics) { if (statistics.getBooleanStatistics().isPresent() && statistics.getBooleanStatistics().get().getFalseCount().isPresent() && statistics.getBooleanStatistics().get().getTrueCount().isPresent()) { long falseCount = statistics.getBooleanStatistics().get().getFalseCount().getAsLong(); long trueCount = statistics.getBooleanStatistics().get().getTrueCount().getAsLong(); return OptionalLong.of((falseCount > 0 ? 1 : 0) + (trueCount > 0 ? 1 : 0)); } if (statistics.getDistinctValuesCount().isPresent()) { return statistics.getDistinctValuesCount(); } return OptionalLong.empty(); }
private static ColumnStatisticsObj createDecimalStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) { DecimalColumnStatsData data = new DecimalColumnStatsData(); statistics.getDecimalStatistics().ifPresent(decimalStatistics -> { decimalStatistics.getMin().ifPresent(value -> data.setLowValue(toMetastoreDecimal(value))); decimalStatistics.getMax().ifPresent(value -> data.setHighValue(toMetastoreDecimal(value))); }); statistics.getNullsCount().ifPresent(data::setNumNulls); toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); return new ColumnStatisticsObj(columnName, columnType.toString(), decimalStats(data)); }
private static ColumnStatisticsObj createDateStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) { DateColumnStatsData data = new DateColumnStatsData(); statistics.getDateStatistics().ifPresent(dateStatistics -> { dateStatistics.getMin().ifPresent(value -> data.setLowValue(toMetastoreDate(value))); dateStatistics.getMax().ifPresent(value -> data.setHighValue(toMetastoreDate(value))); }); statistics.getNullsCount().ifPresent(data::setNumNulls); toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); return new ColumnStatisticsObj(columnName, columnType.toString(), dateStats(data)); }
@Test public void testStringStatsToColumnStatistics() { StringColumnStatsData stringColumnStatsData = new StringColumnStatsData(); stringColumnStatsData.setMaxColLen(100); stringColumnStatsData.setAvgColLen(23.333); stringColumnStatsData.setNumNulls(1); stringColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(stringColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(2)); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.of(100)); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.of(23)); assertEquals(actual.getNullsCount(), OptionalLong.of(1)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(1)); }
@Test public void testBinaryStatsToColumnStatistics() { BinaryColumnStatsData binaryColumnStatsData = new BinaryColumnStatsData(); binaryColumnStatsData.setMaxColLen(100); binaryColumnStatsData.setAvgColLen(22.2); binaryColumnStatsData.setNumNulls(2); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(binaryColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(4)); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.of(100)); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.of(44)); assertEquals(actual.getNullsCount(), OptionalLong.of(2)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testSingleDistinctValue() { DoubleColumnStatsData doubleColumnStatsData = new DoubleColumnStatsData(); doubleColumnStatsData.setNumNulls(10); doubleColumnStatsData.setNumDVs(1); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(10)); assertEquals(actual.getNullsCount(), OptionalLong.of(10)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(0)); doubleColumnStatsData = new DoubleColumnStatsData(); doubleColumnStatsData.setNumNulls(10); doubleColumnStatsData.setNumDVs(1); columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(11)); assertEquals(actual.getNullsCount(), OptionalLong.of(10)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(1)); }
@Test public void testEmptyDateStatsToColumnStatistics() { DateColumnStatsData emptyDateColumnStatsData = new DateColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(emptyDateColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.of(new DateStatistics(Optional.empty(), Optional.empty()))); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testEmptyStringColumnStatsData() { StringColumnStatsData emptyStringColumnStatsData = new StringColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(emptyStringColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testEmptyDecimalStatsToColumnStatistics() { DecimalColumnStatsData emptyDecimalColumnStatsData = new DecimalColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(emptyDecimalColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.of(new DecimalStatistics(Optional.empty(), Optional.empty()))); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testLongStatsToColumnStatistics() { LongColumnStatsData longColumnStatsData = new LongColumnStatsData(); longColumnStatsData.setLowValue(0); longColumnStatsData.setHighValue(100); longColumnStatsData.setNumNulls(1); longColumnStatsData.setNumDVs(20); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(longColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); assertEquals(actual.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(100)))); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.of(1)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); }
@Test public void testEmptyBinaryStatsToColumnStatistics() { BinaryColumnStatsData emptyBinaryColumnStatsData = new BinaryColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(emptyBinaryColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
private Builder(HiveColumnStatistics other) { this.integerStatistics = other.getIntegerStatistics(); this.doubleStatistics = other.getDoubleStatistics(); this.decimalStatistics = other.getDecimalStatistics(); this.dateStatistics = other.getDateStatistics(); this.booleanStatistics = other.getBooleanStatistics(); this.maxValueSizeInBytes = other.getMaxValueSizeInBytes(); this.totalSizeInBytes = other.getTotalSizeInBytes(); this.nullsCount = other.getNullsCount(); this.distinctValuesCount = other.getDistinctValuesCount(); }
@Test public void testBooleanStatsToColumnStatistics() { BooleanColumnStatsData booleanColumnStatsData = new BooleanColumnStatsData(); booleanColumnStatsData.setNumTrues(100); booleanColumnStatsData.setNumFalses(10); booleanColumnStatsData.setNumNulls(0); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(booleanColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.of(100), OptionalLong.of(10)))); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.of(0)); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testEmptyDoubleStatsToColumnStatistics() { DoubleColumnStatsData emptyDoubleColumnStatsData = new DoubleColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(emptyDoubleColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty()))); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testEmptyBooleanStatsToColumnStatistics() { BooleanColumnStatsData emptyBooleanColumnStatsData = new BooleanColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(emptyBooleanColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.empty()); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty()))); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
@Test public void testEmptyLongStatsToColumnStatistics() { LongColumnStatsData emptyLongColumnStatsData = new LongColumnStatsData(); ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(emptyLongColumnStatsData)); HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); assertEquals(actual.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty()))); assertEquals(actual.getDoubleStatistics(), Optional.empty()); assertEquals(actual.getDecimalStatistics(), Optional.empty()); assertEquals(actual.getDateStatistics(), Optional.empty()); assertEquals(actual.getBooleanStatistics(), Optional.empty()); assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); assertEquals(actual.getNullsCount(), OptionalLong.empty()); assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); }
private static ColumnStatisticsObj createDoubleStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) { DoubleColumnStatsData data = new DoubleColumnStatsData(); statistics.getDoubleStatistics().ifPresent(doubleStatistics -> { doubleStatistics.getMin().ifPresent(data::setLowValue); doubleStatistics.getMax().ifPresent(data::setHighValue); }); statistics.getNullsCount().ifPresent(data::setNumNulls); toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); return new ColumnStatisticsObj(columnName, columnType.toString(), doubleStats(data)); }
private static ColumnStatisticsObj createLongStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) { LongColumnStatsData data = new LongColumnStatsData(); statistics.getIntegerStatistics().ifPresent(integerStatistics -> { integerStatistics.getMin().ifPresent(data::setLowValue); integerStatistics.getMax().ifPresent(data::setHighValue); }); statistics.getNullsCount().ifPresent(data::setNumNulls); toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); return new ColumnStatisticsObj(columnName, columnType.toString(), longStats(data)); }
public static HiveColumnStatistics merge(HiveColumnStatistics first, HiveColumnStatistics second) { return new HiveColumnStatistics( mergeIntegerStatistics(first.getIntegerStatistics(), second.getIntegerStatistics()), mergeDoubleStatistics(first.getDoubleStatistics(), second.getDoubleStatistics()), mergeDecimalStatistics(first.getDecimalStatistics(), second.getDecimalStatistics()), mergeDateStatistics(first.getDateStatistics(), second.getDateStatistics()), mergeBooleanStatistics(first.getBooleanStatistics(), second.getBooleanStatistics()), reduce(first.getMaxValueSizeInBytes(), second.getMaxValueSizeInBytes(), MAX, true), reduce(first.getTotalSizeInBytes(), second.getTotalSizeInBytes(), ADD, true), reduce(first.getNullsCount(), second.getNullsCount(), ADD, false), reduce(first.getDistinctValuesCount(), second.getDistinctValuesCount(), MAX, false)); }
private static ColumnStatisticsObj createStringStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics, OptionalLong rowCount) { StringColumnStatsData data = new StringColumnStatsData(); statistics.getNullsCount().ifPresent(data::setNumNulls); toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); data.setMaxColLen(statistics.getMaxValueSizeInBytes().orElse(0)); data.setAvgColLen(getAverageColumnLength(statistics.getTotalSizeInBytes(), rowCount, statistics.getNullsCount()).orElse(0)); return new ColumnStatisticsObj(columnName, columnType.toString(), stringStats(data)); }