org.pentaho.di.core.util.StringEvaluator.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.di.core.util.StringEvaluator.java

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.core.util;

import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.ValueMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaBoolean;
import org.pentaho.di.core.row.value.ValueMetaDate;
import org.pentaho.di.core.row.value.ValueMetaInteger;
import org.pentaho.di.core.row.value.ValueMetaNumber;
import org.pentaho.di.core.row.value.ValueMetaString;

/**
 * This class evaluates strings and extracts a data type. It allows you to specify criteria after which the analysis
 * should be completed.
 *
 * @author matt
 */
public class StringEvaluator {

    /** Distinct values passed to evaluateString so far; a null value is stored as well. */
    private Set<String> values;
    /** Conversion candidates that have not been ruled out yet; challengeConversions removes failing ones. */
    private List<StringEvaluationResult> evaluationResults;
    /** Longest length among the distinct non-null values evaluated. */
    private int maxLength;
    /** Largest precision reported by determinePrecision for the distinct non-null values evaluated. */
    private int maxPrecision;
    /** Number of evaluateString calls, duplicates and nulls included. */
    private int count;
    /** When true, candidates with TRIM_TYPE_BOTH are added and boolean detection trims the value first. */
    private boolean tryTrimming;

    /** String value meta that is cloned and used as the source side of conversions in challengeConversions. */
    private ValueMetaInterface stringMeta;

    /** Date masks used to build the date candidates. */
    private String[] dateFormats;
    /** Number masks used to build the number candidates. */
    private String[] numberFormats;

    /** Number masks used by the constructors that do not receive an explicit list of number formats. */
    private static final String[] DEFAULT_NUMBER_FORMATS = new String[] { "#,###,###.#", "#.#", "#", "#.0", "#.00",
            "#.000", "#.0000", "#.00000", "#.000000", " #.0#" };

    /** Matches any character that is neither a digit nor '#'; determinePrecision uses it to find where the fraction ends. */
    protected static final Pattern PRECISION_PATTERN = Pattern.compile("[^0-9#]");

    /**
     * Creates an evaluator with trimming enabled, by delegating to {@link #StringEvaluator(boolean)} with
     * {@code true}.
     */
    public StringEvaluator() {
        this(true);
    }

    /**
     * Creates an evaluator that uses {@code DEFAULT_NUMBER_FORMATS} as number masks and the masks returned by
     * {@code Const.getDateFormats()} as date masks.
     *
     * @param tryTrimming
     *          whether trimmed variants of the conversion candidates should be tried as well
     */
    public StringEvaluator(boolean tryTrimming) {
        this(tryTrimming, DEFAULT_NUMBER_FORMATS, Const.getDateFormats());
    }

    /**
     * Convenience constructor taking the masks as lists; both lists are copied into arrays and handed to the
     * array-based constructor.
     *
     * @param tryTrimming
     *          whether trimmed variants of the conversion candidates should be tried as well
     * @param numberFormats
     *          number masks to try; must not be null
     * @param dateFormats
     *          date masks to try; must not be null
     */
    public StringEvaluator(boolean tryTrimming, List<String> numberFormats, List<String> dateFormats) {
        this(tryTrimming, numberFormats.toArray(new String[0]), dateFormats.toArray(new String[0]));
    }

    /**
     * Creates an evaluator for the given masks and immediately builds the full list of conversion candidates.
     * The arrays are stored as passed in (no copy is made).
     *
     * @param tryTrimming
     *          whether trimmed variants of the conversion candidates should be tried as well
     * @param numberFormats
     *          number masks to try
     * @param dateFormats
     *          date masks to try
     */
    public StringEvaluator(boolean tryTrimming, String[] numberFormats, String[] dateFormats) {
        // Bookkeeping state
        this.values = new HashSet<String>();
        this.evaluationResults = new ArrayList<StringEvaluationResult>();
        this.count = 0;
        this.stringMeta = new ValueMetaString("string");

        // Configuration; all of it must be in place before the candidates are generated
        this.tryTrimming = tryTrimming;
        this.numberFormats = numberFormats;
        this.dateFormats = dateFormats;

        populateConversionMetaList();
    }

    /**
     * Feeds one more value into the analysis. Every call is counted, but only the first occurrence of a
     * distinct non-null value is actually evaluated (length, precision and conversion candidates).
     *
     * @param value
     *          the value to analyse; may be null
     */
    public void evaluateString(String value) {
        count++;

        // Set.add returns false when the value was already present
        boolean firstOccurrence = values.add(value);
        if (!firstOccurrence || value == null) {
            return;
        }

        evaluateLength(value);
        evaluatePrecision(value);
        challengeConversions(value);
    }

    /**
     * Tests one value against every conversion candidate that is still present in {@code evaluationResults}.
     * Candidates that cannot represent the value are removed from the list; the remaining ones get their
     * success/null counters updated, and date and numeric candidates also get their min/max updated.
     * evaluateString only calls this for non-null values.
     *
     * @param value
     *          the value to test
     */
    private void challengeConversions(String value) {
        // Iterate over a snapshot so that candidates can be removed from evaluationResults inside the loop.
        List<StringEvaluationResult> all = new ArrayList<StringEvaluationResult>(evaluationResults);
        ValueMetaInterface stringMetaClone = null;
        for (StringEvaluationResult cmm : all) {
            if (cmm.getConversionMeta().isBoolean()) {
                // Boolean conversion never fails.
                // If it's a Y, N, true, false it's a boolean otherwise it ain't.
                //
                String string;
                if (tryTrimming) {
                    string = Const.trim(value);
                } else {
                    string = value;
                }
                // NOTE(review): the emptiness test uses the untrimmed value, not the trimmed string above,
                // so a blank value is not counted as null here - confirm that this is intended.
                if (StringUtils.isEmpty(value)) {
                    cmm.incrementNrNull();
                } else if (!("Y".equalsIgnoreCase(string) || "N".equalsIgnoreCase(string)
                        || "TRUE".equalsIgnoreCase(string) || "FALSE".equalsIgnoreCase(string))) {
                    evaluationResults.remove(cmm);
                } else {
                    cmm.incrementSuccesses();
                }
            } else if (cmm.getConversionMeta().isDate()) {
                // Date candidates: first check the value against the mask, then parse it to track min/max.
                String dateFormat = cmm.getConversionMeta().getConversionMask();
                if (!DateDetector.isValidDateFormatToStringDate(dateFormat, value, "en_US")) {
                    evaluationResults.remove(cmm);
                } else {
                    try {
                        Object object = DateDetector.getDateFromStringByFormat(value, dateFormat);
                        cmm.incrementSuccesses();
                        if (cmm.getMin() == null || cmm.getConversionMeta().compare(cmm.getMin(), object) > 0) {
                            cmm.setMin(object);
                        }
                        if (cmm.getMax() == null || cmm.getConversionMeta().compare(cmm.getMax(), object) < 0) {
                            cmm.setMax(object);
                        }
                    } catch (ParseException e) {
                        evaluationResults.remove(cmm);
                    } catch (KettleValueException e) {
                        evaluationResults.remove(cmm);
                    }
                }
            } else {
                try {
                    if (cmm.getConversionMeta().isNumeric()) {
                        // Character pre-scan for numeric candidates, done before the actual conversion.
                        boolean stop = false;
                        int nrDots = 0;
                        int nrCommas = 0;
                        int pos = 0;
                        for (char c : value.toCharArray()) {

                            // Despite its name, this is true when c is neither the candidate's currency symbol
                            // nor a parenthesis.
                            boolean currencySymbolMatch = !String.valueOf(c)
                                    .equals(cmm.getConversionMeta().getCurrencySymbol()) && c != '(' && c != ')';

                            // NOTE(review): all clauses are AND-ed and the last one requires c to be '+' or '-'
                            // at pos > 0, so this test only rejects a sign that is not the first character.
                            // Any other unexpected character falls through to the conversion below - confirm
                            // that this is intended.
                            if (!Character.isDigit(c) && c != '.' && c != ',' && !Character.isSpaceChar(c)
                                    && currencySymbolMatch && (pos > 0 && (c == '+' || c == '-')) // allow + & - at the 1st position
                            ) {
                                evaluationResults.remove(cmm);
                                stop = true;
                                break;
                            }

                            // If the value contains a decimal or grouping symbol or some sort, it's not an integer
                            //
                            if ((c == '.' && cmm.getConversionMeta().isInteger())
                                    || (c == ',' && cmm.getConversionMeta().isInteger())) {
                                evaluationResults.remove(cmm);
                                stop = true;
                                break;
                            }
                            if (c == '.') {
                                nrDots++;
                            }
                            if (c == ',') {
                                nrCommas++;
                            }
                            pos++;
                        }

                        // More than one dot together with more than one comma: drop the candidate.
                        if (nrDots > 1 && nrCommas > 1) {
                            evaluationResults.remove(cmm);
                            stop = true;
                        }

                        // The candidate was removed during the pre-scan; skip the conversion attempt.
                        if (stop) {
                            continue;
                        }

                    }

                    if (stringMetaClone == null) {
                        // avoid cloning each time
                        stringMetaClone = stringMeta.clone();
                    }
                    stringMetaClone.setConversionMetadata(cmm.getConversionMeta());
                    stringMetaClone.setTrimType(cmm.getConversionMeta().getTrimType());
                    Object object = stringMetaClone.convertDataUsingConversionMetaData(value);

                    // Still here? Evaluate the data...
                    // Keep track of null values, min, max, etc.
                    //
                    if (cmm.getConversionMeta().isNull(object)) {
                        cmm.incrementNrNull();
                    } else {
                        cmm.incrementSuccesses();
                    }
                    // Min/max are updated for every converted value, including those counted as null above.
                    if (cmm.getMin() == null || cmm.getConversionMeta().compare(cmm.getMin(), object) > 0) {
                        cmm.setMin(object);
                    }
                    if (cmm.getMax() == null || cmm.getConversionMeta().compare(cmm.getMax(), object) < 0) {
                        cmm.setMax(object);
                    }
                } catch (KettleValueException e) {
                    // This one doesn't work, remove it from the list!
                    //
                    evaluationResults.remove(cmm);
                }
            }
        }
    }

    /**
     * Raises {@code maxLength} when the given value is longer than anything seen so far.
     *
     * @param value
     *          the non-null value being evaluated
     */
    private void evaluateLength(String value) {
        maxLength = Math.max(maxLength, value.length());
    }

    /**
     * Raises {@code maxPrecision} when the given value has more fraction characters (as counted by
     * determinePrecision) than anything seen so far.
     *
     * @param value
     *          the non-null value being evaluated
     */
    private void evaluatePrecision(String value) {
        maxPrecision = Math.max(maxPrecision, determinePrecision(value));
    }

    /**
     * @return true when at least one remaining candidate is an Integer conversion with one or more successes
     */
    private boolean containsInteger() {
        boolean found = false;
        Iterator<StringEvaluationResult> candidates = evaluationResults.iterator();
        while (!found && candidates.hasNext()) {
            StringEvaluationResult candidate = candidates.next();
            found = candidate.getConversionMeta().isInteger() && candidate.getNrSuccesses() > 0;
        }
        return found;
    }

    /**
     * @return true when at least one remaining candidate is a Number conversion with one or more successes
     */
    private boolean containsNumber() {
        boolean found = false;
        Iterator<StringEvaluationResult> candidates = evaluationResults.iterator();
        while (!found && candidates.hasNext()) {
            StringEvaluationResult candidate = candidates.next();
            found = candidate.getConversionMeta().isNumber() && candidate.getNrSuccesses() > 0;
        }
        return found;
    }

    /**
     * @return true when at least one remaining candidate is a Date conversion with one or more successes
     */
    private boolean containsDate() {
        boolean found = false;
        Iterator<StringEvaluationResult> candidates = evaluationResults.iterator();
        while (!found && candidates.hasNext()) {
            StringEvaluationResult candidate = candidates.next();
            found = candidate.getConversionMeta().isDate() && candidate.getNrSuccesses() > 0;
        }
        return found;
    }

    /**
     * Picks the conversion that best describes the values evaluated so far.
     * <p>
     * When no candidate is left, a String result is built from the distinct values (length, null count,
     * smallest and largest value). Otherwise the remaining candidates are pruned and sorted and the first one
     * is returned. Note that the pruning and sorting are applied to the evaluator's own candidate list.
     *
     * @return the advised evaluation result
     */
    public StringEvaluationResult getAdvicedResult() {
        if (evaluationResults.isEmpty()) {
            // Every candidate was ruled out: fall back to a plain String.
            String smallest = null;
            String largest = null;
            int nullCount = 0;
            for (String candidate : values) {
                if (candidate == null) {
                    nullCount++;
                    continue;
                }
                if (smallest == null || smallest.compareTo(candidate) > 0) {
                    smallest = candidate;
                }
                if (largest == null || largest.compareTo(candidate) < 0) {
                    largest = candidate;
                }
            }

            ValueMetaInterface stringAdvice = new ValueMetaString("adviced");
            stringAdvice.setLength(maxLength);

            StringEvaluationResult stringResult = new StringEvaluationResult(stringAdvice);
            stringResult.setNrNull(nullCount);
            stringResult.setMin(smallest);
            stringResult.setMax(largest);
            return stringResult;
        }

        // Integers and Numbers both matched: the observed precision decides which kind is dropped.
        if (containsInteger() && containsNumber()) {
            Iterator<StringEvaluationResult> pruning = evaluationResults.iterator();
            while (pruning.hasNext()) {
                ValueMetaInterface meta = pruning.next().getConversionMeta();
                // no precision -> Numbers are superfluous; precision needed -> Integers cannot be used
                boolean superfluous = maxPrecision == 0 ? meta.isNumber() : (maxPrecision > 0 && meta.isInteger());
                if (superfluous) {
                    pruning.remove();
                }
            }
        }

        // Dates and Integers both matched: dates win, so the integers go.
        if (containsInteger() && containsDate()) {
            Iterator<StringEvaluationResult> pruning = evaluationResults.iterator();
            while (pruning.hasNext()) {
                if (pruning.next().getConversionMeta().isInteger()) {
                    pruning.remove();
                }
            }
        }

        // Dates prefer the longest mask, numerics & integers the shortest one.
        final boolean longestMaskFirst = containsDate();
        Comparator<StringEvaluationResult> byMaskLength = new Comparator<StringEvaluationResult>() {
            @Override
            public int compare(StringEvaluationResult r1, StringEvaluationResult r2) {
                int length1 = maskLength(r1);
                int length2 = maskLength(r2);
                return longestMaskFirst ? Integer.compare(length2, length1) : Integer.compare(length1, length2);
            }
        };
        Collections.sort(evaluationResults, byMaskLength);

        StringEvaluationResult best = evaluationResults.get(0);
        ValueMetaInterface bestMeta = best.getConversionMeta();
        boolean plainNumber = bestMeta.isNumber() && bestMeta.getCurrencySymbol() == null;
        if (plainNumber) {
            // Report the precision (and, when there is one, the length) actually observed in the data.
            bestMeta.setPrecision(maxPrecision);
            if (maxPrecision > 0 && maxLength > 0) {
                bestMeta.setLength(maxLength);
            }
        }
        return best;
    }

    /** Length of the candidate's conversion mask, with a missing mask counting as 0. */
    private static int maskLength(StringEvaluationResult candidate) {
        String mask = candidate.getConversionMeta().getConversionMask();
        return mask == null ? 0 : mask.length();
    }

    /**
     * @return the date masks this evaluator was created with; the internal array itself is returned, not a copy
     */
    public String[] getDateFormats() {
        return dateFormats;
    }

    /**
     * @return the number masks this evaluator was created with; the internal array itself is returned, not a
     *         copy
     */
    public String[] getNumberFormats() {
        return numberFormats;
    }

    /**
     * Fills {@code evaluationResults} with every conversion candidate to be tried. For each trim type (none,
     * plus both-sides when trimming is enabled) the candidates are added in this order: dates, US/EU numbers
     * per number mask, the default locale's currency, that currency mask without its symbol (US/EU), plain
     * integers, zero-padded integers of 1 to 15 digits, and finally boolean.
     */
    private void populateConversionMetaList() {

        final int[] trimTypes = tryTrimming
                ? new int[] { ValueMetaInterface.TRIM_TYPE_NONE, ValueMetaInterface.TRIM_TYPE_BOTH }
                : new int[] { ValueMetaInterface.TRIM_TYPE_NONE };

        for (int trimType : trimTypes) {
            // Dates
            //
            for (String dateMask : getDateFormats()) {
                ValueMetaInterface dateMeta = new ValueMetaDate("date");
                dateMeta.setConversionMask(dateMask);
                dateMeta.setTrimType(trimType);
                dateMeta.setDateFormatLenient(false);
                evaluationResults.add(new StringEvaluationResult(dateMeta));
            }

            // Numbers, in US (decimal '.', grouping ',') and EU (decimal ',', grouping '.') flavours
            //
            EvalResultBuilder usNumbers = new EvalResultBuilder("number-us", ValueMetaInterface.TYPE_NUMBER, 15,
                    trimType, ".", ",");
            EvalResultBuilder euNumbers = new EvalResultBuilder("number-eu", ValueMetaInterface.TYPE_NUMBER, 15,
                    trimType, ",", ".");

            for (String numberMask : getNumberFormats()) {
                // the integer masks are handled further down
                boolean integerMask = numberMask.equals("#") || numberMask.equals("0");
                if (!integerMask) {
                    int precision = determinePrecision(numberMask);
                    evaluationResults.add(usNumbers.format(numberMask, precision).build());
                    evaluationResults.add(euNumbers.format(numberMask, precision).build());
                }
            }

            // Currency of the default locale
            //
            DecimalFormat currencyFormat = (DecimalFormat) NumberFormat.getCurrencyInstance();
            String currencySymbol = currencyFormat.getCurrency().getSymbol();
            int currencyPrecision = currencyFormat.getCurrency().getDefaultFractionDigits();
            // show the locale's own symbol instead of the generic currency sign, for user recognition
            String currencyMask = currencyFormat.toLocalizedPattern().replace("\u00A4", currencySymbol);

            ValueMetaInterface currencyMeta = new ValueMetaNumber("number-currency");
            currencyMeta.setConversionMask(currencyMask);
            currencyMeta.setTrimType(trimType);
            currencyMeta.setDecimalSymbol(
                    String.valueOf(currencyFormat.getDecimalFormatSymbols().getDecimalSeparator()));
            currencyMeta.setGroupingSymbol(
                    String.valueOf(currencyFormat.getDecimalFormatSymbols().getGroupingSeparator()));
            currencyMeta.setCurrencySymbol(currencySymbol);
            currencyMeta.setLength(15);
            currencyMeta.setPrecision(currencyPrecision);
            evaluationResults.add(new StringEvaluationResult(currencyMeta));

            // The same mask with the currency symbol stripped
            //
            String currencyMaskAsNumeric = currencyMask.replace(currencySymbol, "");
            evaluationResults.add(usNumbers.format(currencyMaskAsNumeric, currencyPrecision).build());
            evaluationResults.add(euNumbers.format(currencyMaskAsNumeric, currencyPrecision).build());

            // Integers
            //
            for (String integerMask : new String[] { "#", " #" }) {
                ValueMetaInterface integerMeta = new ValueMetaInteger("integer");
                integerMeta.setConversionMask(integerMask);
                integerMeta.setLength(15);
                evaluationResults.add(new StringEvaluationResult(integerMeta));
            }

            // Left zero padded integers: one more '0' per iteration, masks look like " 000;-000"
            //
            StringBuilder zeros = new StringBuilder();
            for (int digits = 1; digits <= 15; digits++) {
                zeros.append('0');

                ValueMetaInterface paddedMeta = new ValueMetaInteger("integer-zero-padded-" + digits);
                paddedMeta.setConversionMask(" " + zeros + ";-" + zeros);
                paddedMeta.setLength(digits);
                evaluationResults.add(new StringEvaluationResult(paddedMeta));
            }

            // Boolean
            //
            evaluationResults.add(new StringEvaluationResult(new ValueMetaBoolean("boolean")));
        }
    }

    /**
     * Counts the characters of the fractional part of a numeric mask or value: the run of digits and '#'
     * characters that directly follows the last decimal separator. The separator looked for is the one of
     * {@code NumberFormat.getInstance()}, i.e. it depends on the default locale.
     *
     * @param numericFormat
     *          the mask or value to inspect; may be null
     * @return the number of fraction characters, or 0 when the argument is null or has no decimal separator
     */
    protected static int determinePrecision(String numericFormat) {
        if (numericFormat == null) {
            return 0;
        }

        char decimalSymbol = ((DecimalFormat) NumberFormat.getInstance()).getDecimalFormatSymbols()
                .getDecimalSeparator();
        int separatorAt = numericFormat.lastIndexOf(decimalSymbol);
        if (separatorAt < 0) {
            return 0;
        }

        // The fraction ends at the first character that is not a digit or '#', or at the end of the string.
        String afterSeparator = numericFormat.substring(separatorAt + 1);
        Matcher terminator = PRECISION_PATTERN.matcher(afterSeparator);
        return terminator.find() ? terminator.start() : afterSeparator.length();
    }

    /**
     * Gives access to the distinct values collected by evaluateString. The internal set itself is returned,
     * not a copy, and it contains null when a null value was evaluated.
     *
     * @return The distinct set of string values
     */
    public Set<String> getValues() {
        return values;
    }

    /**
     * PDI-7736: Only the evaluations that succeeded at least once are returned. The result is a new list;
     * the candidates in it are the evaluator's own result objects.
     *
     * @return The list of string evaluation results with one or more successes
     */
    public List<StringEvaluationResult> getStringEvaluationResults() {
        List<StringEvaluationResult> successful = new ArrayList<>();
        Iterator<StringEvaluationResult> candidates = evaluationResults.iterator();
        while (candidates.hasNext()) {
            StringEvaluationResult candidate = candidates.next();
            if (candidate.getNrSuccesses() > 0) {
                successful.add(candidate);
            }
        }
        return successful;
    }

    /**
     * The counter is incremented on every evaluateString call, so duplicates and null values are included.
     *
     * @return the number of values analyzed
     */
    public int getCount() {
        return count;
    }

    /**
     * The length is only tracked for non-null values.
     *
     * @return The maximum string length encountered
     */
    public int getMaxLength() {
        return maxLength;
    }

    /**
     * Builds numeric conversion candidates that share name, type, length, trim type and decimal/grouping
     * symbols and differ only in conversion mask and precision. Set the varying part with
     * {@link #format(String, int)}, then call {@link #build()}; the builder can be reused for the next mask.
     */
    private static class EvalResultBuilder {
        // Fixed for the lifetime of the builder
        private final String name;
        private final int type;
        private final int length;
        private final int trimType;
        private final String decimalSymbol;
        private final String groupingSymbol;

        // Varies per built result
        private String format;
        private int precision;

        public EvalResultBuilder(String name, int type, int length, int trimType, String decimalSymbol,
                String groupingSymbol) {
            this.name = name;
            this.type = type;
            this.length = length;
            this.trimType = trimType;
            this.decimalSymbol = decimalSymbol;
            this.groupingSymbol = groupingSymbol;
        }

        /**
         * Sets the conversion mask and precision used by the next {@link #build()} call.
         *
         * @return this builder, to allow chaining
         */
        public EvalResultBuilder format(String format, int precision) {
            this.format = format;
            this.precision = precision;
            return this;
        }

        /**
         * @return a new evaluation result wrapping a value meta configured from the builder's current state
         */
        public StringEvaluationResult build() {
            ValueMetaInterface built = new ValueMeta(name, type);
            built.setConversionMask(format);
            built.setTrimType(trimType);
            built.setDecimalSymbol(decimalSymbol);
            built.setGroupingSymbol(groupingSymbol);
            built.setLength(length);
            built.setPrecision(precision);
            return new StringEvaluationResult(built);
        }
    }
}