org.talend.dataprep.dataset.StatisticsAdapter.java Source code

Java tutorial

Introduction

Here is the source code for org.talend.dataprep.dataset.StatisticsAdapter.java

Source

// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// https://github.com/Talend/data-prep/blob/master/LICENSE
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================

package org.talend.dataprep.dataset;

import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataprep.api.dataset.ColumnMetadata;
import org.talend.dataprep.api.dataset.Quality;
import org.talend.dataprep.api.dataset.statistics.*;
import org.talend.dataprep.api.dataset.statistics.date.DateHistogram;
import org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics;
import org.talend.dataprep.api.dataset.statistics.number.NumberHistogram;
import org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramStatistics;
import org.talend.dataprep.api.type.Type;
import org.talend.dataprep.api.type.TypeUtils;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.ValueQualityStatistics;
import org.talend.dataquality.semantic.api.CategoryRegistryManager;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.recognizer.CategoryFrequency;
import org.talend.dataquality.semantic.statistics.SemanticType;
import org.talend.dataquality.statistics.cardinality.CardinalityStatistics;
import org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics;
import org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics;
import org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics;
import org.talend.dataquality.statistics.numeric.summary.SummaryStatistics;
import org.talend.dataquality.statistics.text.TextLengthStatistics;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.DataTypeOccurences;

import static java.util.Locale.ENGLISH;
import static org.talend.dataprep.api.type.Type.*;

/**
 * Statistics adapter: injects each part of the analysis results (type, quality, semantic domain, frequencies,
 * histograms...) into the columns metadata.
 */
public class StatisticsAdapter {

    private static final Logger LOGGER = LoggerFactory.getLogger(StatisticsAdapter.class);

    /**
     * Defines the minimum threshold for a semantic type suggestion. Defaults to 40% if not defined.
     */
    private final int semanticThreshold;

    /**
     * Creates an adapter that uses the given semantic type threshold.
     *
     * @param semanticThreshold the value a detected semantic category frequency must strictly exceed for the
     * category to be set as the column domain.
     */
    public StatisticsAdapter(int semanticThreshold) {
        this.semanticThreshold = semanticThreshold;
    }

    /**
     * Injects the analysis results in the columns metadata, taking every column into account.
     *
     * @param columns The columns metadata to update.
     * @param results The analysis results to read from.
     * @see #adapt(List, List, Predicate) to restrict the columns that receive results.
     */
    public void adapt(List<ColumnMetadata> columns, List<Analyzers.Result> results) {
        // No filtering requested: every column is accepted.
        final Predicate<ColumnMetadata> acceptEveryColumn = column -> true;
        adapt(columns, results, acceptEveryColumn);
    }

    /**
     * Extracts the analysis results and injects them in the metadata of the columns accepted by the filter.
     * This method simply delegates to {@link #genericAdapt(List, List, Predicate)}.
     *
     * @param columns The columns metadata
     * @param results The analysis results
     * @param filter A {@link Predicate predicate} to filter columns to adapt.
     */
    public void adapt(List<ColumnMetadata> columns, List<Analyzers.Result> results,
            Predicate<ColumnMetadata> filter) {
        genericAdapt(columns, results, filter);
    }

    /**
     * Pairs each column accepted by the filter with the next available analysis result and injects that
     * result in the column metadata. Results are consumed in order; once they are exhausted, the remaining
     * columns are left untouched.
     *
     * @param columns The columns metadata
     * @param results The analysis results
     * @param filter A {@link Predicate predicate} to filter columns to adapt.
     */
    private void genericAdapt(List<ColumnMetadata> columns, List<Analyzers.Result> results,
            Predicate<ColumnMetadata> filter) {
        final Iterator<Analyzers.Result> pendingResults = results.iterator();
        for (final ColumnMetadata column : columns) {
            // A result is only consumed for a column that passes the filter.
            if (!filter.test(column) || !pendingResults.hasNext()) {
                continue;
            }
            final Analyzers.Result columnResult = pendingResults.next();
            injectDataTypeAnalysis(column, columnResult);
            adaptCommonAnalysis(column, columnResult);
        }
    }

    /**
     * Extracts remaining analysis result (other than data type) and inject them in the specified column metadata.
     * Each injection step below checks by itself whether the result holds the statistics it needs.
     *
     * @param currentColumn the specified column metadata
     * @param result the specified Analysis result
     */
    private void adaptCommonAnalysis(final ColumnMetadata currentColumn, final Analyzers.Result result) {
        injectSemanticTypes(currentColumn, result);
        injectCardinality(currentColumn, result); // distinct + duplicates
        injectDataFrequency(currentColumn, result);
        injectPatternFrequency(currentColumn, result);
        injectQuantile(currentColumn, result);
        injectNumberSummary(currentColumn, result); // min, max, mean, variance
        injectTextLength(currentColumn, result);
        // Both histogram steps write to the same statistics field; each one is guarded by the column type.
        injectNumberHistogram(currentColumn, result);
        injectDateHistogram(currentColumn, result);
    }

    /**
     * Injects the suggested data type, then the value quality, in the specified column metadata.
     *
     * The column type is only replaced when the result holds a data type analysis and the column type was not
     * forced. The value quality is injected in every case.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectDataTypeAnalysis(final ColumnMetadata column, final Analyzers.Result result) {
        final boolean typeCanBeUpdated = result.exist(DataTypeOccurences.class) && !column.isTypeForced();
        if (typeCanBeUpdated) {
            final DataTypeEnum suggestion = result.get(DataTypeOccurences.class).getSuggestedType();
            column.setType(Type.get(suggestion.name()).getName());
        }
        injectValueQuality(column, result);
    }

    /**
     * Injects the value quality (total, empty, valid and invalid counts) in both the quality and the
     * statistics of the specified column metadata. Nothing is done when the result holds no value quality
     * statistics.
     *
     * The invalid count is derived as <code>count - empty - valid</code>.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectValueQuality(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(ValueQualityStatistics.class)) {
            return;
        }
        final Statistics statistics = column.getStatistics();
        final Quality quality = column.getQuality();
        final ValueQualityStatistics valueQualityStatistics = result.get(ValueQualityStatistics.class);

        final long allCount = valueQualityStatistics.getCount();
        final long emptyCount = valueQualityStatistics.getEmptyCount();
        final long validCount = valueQualityStatistics.getValidCount();
        final long invalidCount = allCount - emptyCount - validCount;

        // The analyzer counts are longs: clamp them instead of using a plain (int) cast, which would silently
        // wrap around (e.g. to a negative count) on very large data sets.
        final int empty = saturatedInt(emptyCount);
        final int valid = saturatedInt(validCount);
        final int invalid = saturatedInt(invalidCount);

        // Set in column quality...
        quality.setEmpty(empty);
        quality.setValid(valid);
        quality.setInvalid(invalid);
        // ... and statistics
        statistics.setCount(allCount);
        statistics.setEmpty(empty);
        statistics.setInvalid(invalid);
        statistics.setValid(validCount);
    }

    /**
     * Converts a long to an int without wrapping around: values outside of the int range are clamped to
     * {@link Integer#MIN_VALUE} or {@link Integer#MAX_VALUE}.
     *
     * @param value the value to convert
     * @return the closest int to the given value
     */
    private static int saturatedInt(final long value) {
        return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, value));
    }

    /**
     * Injects the semantic domain and the list of suggested semantic domains in the specified column metadata.
     * Nothing is done when the result holds no semantic analysis or when the column domain was forced.
     *
     * The greatest category that has a non empty name becomes the column domain, provided its frequency
     * exceeds the semantic threshold. Otherwise the column domain is cleared.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectSemanticTypes(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(SemanticType.class) || column.isDomainForced()) {
            return;
        }
        final SemanticType semanticType = result.get(SemanticType.class);
        final Map<CategoryFrequency, Long> categoryCounts = semanticType.getCategoryToCount();

        // Look for the greatest named category (on ties, the first one met is kept).
        CategoryFrequency best = null;
        for (final Map.Entry<CategoryFrequency, Long> candidate : categoryCounts.entrySet()) {
            final CategoryFrequency category = candidate.getKey();
            if (category.getCategoryName().isEmpty()) {
                continue;
            }
            if (best == null || best.compareTo(category) < 0) {
                best = category;
            }
        }

        if (best != null) {
            // TODO (TDP-734) Take into account limit of the semantic analyzer.
            final float frequency = best.getFrequency();
            // TDP-471: Don't pick semantic type if lower than a threshold.
            if (frequency > semanticThreshold) {
                final String categoryId = best.getCategoryId();
                final DQCategory knownCategory = CategoryRegistryManager.getInstance()
                        .getCategoryMetadataByName(categoryId);
                if (knownCategory != null) {
                    column.setDomain(knownCategory.getName());
                    column.setDomainLabel(knownCategory.getLabel());
                } else {
                    // Unknown category: fall back on its id for both the domain and its label.
                    LOGGER.error("Could not find {} in known categories.", categoryId);
                    column.setDomain(categoryId);
                    column.setDomainLabel(categoryId);
                }
                column.setDomainFrequency(frequency);
            } else {
                // An earlier analysis (e.g. on the first 20 lines) may have been over the threshold, but the
                // full scan may decide otherwise: make sure the domain is cleared.
                clearDomain(column);
            }
        } else if (!StringUtils.isEmpty(column.getDomain())) {
            // Column *had* a domain but seems like new analysis removed it.
            clearDomain(column);
        }

        // Remembers all suggested semantic categories (unknown category = "" is skipped).
        final List<SemanticDomain> suggestions = categoryCounts.entrySet().stream() //
                .map(Map.Entry::getKey) //
                .filter(category -> !StringUtils.isEmpty(category.getCategoryId())) //
                .map(category -> {
                    final String id = category.getCategoryId();
                    return new SemanticDomain(id, TypeUtils.getDomainLabel(id), category.getFrequency());
                }) //
                .filter(suggestion -> suggestion.getFrequency() >= 1) //
                .limit(10) //
                .collect(Collectors.toList());
        column.setSemanticDomains(suggestions);
    }

    /**
     * Resets the domain, the domain label and the domain frequency of the specified column metadata.
     *
     * @param column the specified column metadata
     */
    private void clearDomain(final ColumnMetadata column) {
        column.setDomain(StringUtils.EMPTY);
        column.setDomainLabel(StringUtils.EMPTY);
        column.setDomainFrequency(0);
    }

    /**
     * Injects the distinct and duplicate counts in the statistics of the specified column metadata. Nothing is
     * done when the result holds no cardinality statistics.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectCardinality(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(CardinalityStatistics.class)) {
            return;
        }
        final Statistics target = column.getStatistics();
        final CardinalityStatistics cardinality = result.get(CardinalityStatistics.class);
        target.setDistinctCount(cardinality.getDistinctCount());
        target.setDuplicateCount(cardinality.getDuplicateCount());
    }

    /**
     * Replaces the data frequencies of the specified column metadata with the 15 most frequent values found
     * by the analysis. Nothing is done when the result holds no data frequency statistics, or when no top
     * values are available.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectDataFrequency(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(DataTypeFrequencyStatistics.class)) {
            return;
        }
        final Statistics statistics = column.getStatistics();
        final Map<String, Long> mostFrequentValues = result.get(DataTypeFrequencyStatistics.class).getTopK(15);
        if (mostFrequentValues == null) {
            return;
        }
        statistics.getDataFrequencies().clear();
        for (final Map.Entry<String, Long> value : mostFrequentValues.entrySet()) {
            statistics.getDataFrequencies().add(new DataFrequency(value.getKey(), value.getValue()));
        }
    }

    /**
     * Replaces the pattern frequencies of the specified column metadata with the 15 most frequent patterns
     * found by the analysis. Nothing is done when the result holds no pattern frequency statistics, or when no
     * top patterns are available.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectPatternFrequency(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(PatternFrequencyStatistics.class)) {
            return;
        }
        final Statistics statistics = column.getStatistics();
        final Map<String, Long> mostFrequentPatterns = result.get(PatternFrequencyStatistics.class).getTopK(15);
        if (mostFrequentPatterns == null) {
            return;
        }
        statistics.getPatternFrequencies().clear();
        for (final Map.Entry<String, Long> pattern : mostFrequentPatterns.entrySet()) {
            statistics.getPatternFrequencies().add(new PatternFrequency(pattern.getKey(), pattern.getValue()));
        }
    }

    /**
     * Injects the quantiles (lower quartile, median, upper quartile) in the statistics of the specified column
     * metadata. Nothing is done when the result holds no quantile statistics.
     *
     * This is a best effort: any failure is logged and the quantiles computation is abandoned for this
     * column, so that the other statistics can still be injected.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectQuantile(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(QuantileStatistics.class)) {
            return;
        }
        try {
            final QuantileStatistics quantileStatistics = result.get(QuantileStatistics.class);
            quantileStatistics.endAddValue();
            final Quantiles quantiles = column.getStatistics().getQuantiles();
            quantiles.setLowerQuantile(quantileStatistics.getLowerQuartile());
            quantiles.setMedian(quantileStatistics.getMedian());
            quantiles.setUpperQuantile(quantileStatistics.getUpperQuartile());
        } catch (Exception e) {
            LOGGER.warn("Unable to inject quantile information in column {}.", column.getName());
            // The column name must be passed explicitly: with the exception as sole argument, the call binds
            // to debug(String, Throwable) and the "{}" placeholder is never substituted.
            LOGGER.debug("Unable to inject quantile information in column {}.", column.getName(), e);
        }
    }

    /**
     * Injects the numerical summary (mean, variance, min and max) in the statistics of the specified column
     * metadata. Nothing is done when the result holds no summary statistics.
     *
     * For columns of type date that come with a date histogram, min and max are read from that histogram
     * instead of the summary statistics.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectNumberSummary(final ColumnMetadata column, final Analyzers.Result result) {
        if (!result.exist(SummaryStatistics.class)) {
            return;
        }
        final Statistics target = column.getStatistics();
        final SummaryStatistics summary = result.get(SummaryStatistics.class);
        target.setMean(summary.getMean());
        target.setVariance(summary.getVariance());

        final boolean boundsFromDateHistogram = DATE.isAssignableFrom(column.getType())
                && result.exist(StreamDateHistogramStatistics.class);
        if (!boundsFromDateHistogram) {
            target.setMax(summary.getMax());
            target.setMin(summary.getMin());
            return;
        }
        // Date column: bounds are the histogram ones.
        final DateHistogram dateHistogram = (DateHistogram) result.get(StreamDateHistogramStatistics.class)
                .getHistogram();
        target.setMax(dateHistogram.getMaxUTCEpochMilliseconds());
        target.setMin(dateHistogram.getMinUTCEpochMilliseconds());
    }

    /**
     * Injects the number histogram in the statistics of the specified column metadata. Nothing is done unless
     * the column type is numeric and the result holds number histogram statistics.
     *
     * The bounds of each range are rounded by going through the default ENGLISH number format. When a
     * formatted bound cannot be parsed back, the raw bounds are used as is.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectNumberHistogram(final ColumnMetadata column, final Analyzers.Result result) {
        if (NUMERIC.isAssignableFrom(column.getType()) && result.exist(StreamNumberHistogramStatistics.class)) {
            final Statistics statistics = column.getStatistics();
            final Map<org.talend.dataquality.statistics.numeric.histogram.Range, Long> histogramStatistics = result
                    .get(StreamNumberHistogramStatistics.class).getHistogram();
            final NumberFormat format = NumberFormat.getInstance(ENGLISH);
            // Grouping separators (e.g. "1,234.5") cannot be parsed back by Double.valueOf: without this, every
            // bound with a magnitude >= 1000 silently skipped the rounding and fell back on the raw value.
            format.setGroupingUsed(false);

            // Set histogram ranges
            final Histogram histogram = new NumberHistogram();
            histogramStatistics.forEach((rangeValues, occurrence) -> {
                final HistogramRange range = new HistogramRange();
                try {
                    // Parse both bounds before setting any of them, so a range is never half rounded.
                    final Double roundedMax = Double.valueOf(format.format(rangeValues.getUpper()));
                    final Double roundedMin = Double.valueOf(format.format(rangeValues.getLower()));
                    range.getRange().setMax(roundedMax);
                    range.getRange().setMin(roundedMin);
                } catch (NumberFormatException e) {
                    // Fallback to non formatted numbers (unable to parse numbers).
                    range.getRange().setMax(rangeValues.getUpper());
                    range.getRange().setMin(rangeValues.getLower());
                }
                range.setOccurrences(occurrence);
                histogram.getItems().add(range);
            });
            statistics.setHistogram(histogram);
        }
    }

    /**
     * Injects the date histogram in the statistics of the specified column metadata. Nothing is done unless
     * the column type is a date and the result holds date histogram statistics.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectDateHistogram(final ColumnMetadata column, final Analyzers.Result result) {
        if (!DATE.isAssignableFrom(column.getType()) || !result.exist(StreamDateHistogramStatistics.class)) {
            return;
        }
        final Histogram dateHistogram = result.get(StreamDateHistogramStatistics.class).getHistogram();
        column.getStatistics().setHistogram(dateHistogram);
    }

    /**
     * Injects the text length summary (average, minimal and maximal lengths) in the statistics of the
     * specified column metadata. Nothing is done unless the column type is exactly STRING and the result holds
     * text length statistics.
     *
     * @param column the specified column metadata
     * @param result the analyzer result
     */
    private void injectTextLength(final ColumnMetadata column, final Analyzers.Result result) {
        if (!STRING.equals(Type.get(column.getType())) || !result.exist(TextLengthStatistics.class)) {
            return;
        }
        final TextLengthStatistics lengths = result.get(TextLengthStatistics.class);
        final TextLengthSummary summary = column.getStatistics().getTextLengthSummary();
        summary.setAverageLength(lengths.getAvgTextLength());
        summary.setMinimalLength(lengths.getMinTextLength());
        summary.setMaximalLength(lengths.getMaxTextLength());
    }

}