Java tutorial
/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.core.util; import java.text.DecimalFormat; import java.text.NumberFormat; import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.pentaho.di.core.Const; import org.pentaho.di.core.exception.KettleValueException; import org.pentaho.di.core.row.ValueMeta; import org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.core.row.value.ValueMetaBoolean; import org.pentaho.di.core.row.value.ValueMetaDate; import org.pentaho.di.core.row.value.ValueMetaInteger; import org.pentaho.di.core.row.value.ValueMetaNumber; import org.pentaho.di.core.row.value.ValueMetaString; /** * This class evaluates strings and extracts a data type. It allows you to criteria after which the analysis should be * completed. * * @author matt */ public class StringEvaluator { private Set<String> values; private List<StringEvaluationResult> evaluationResults; private int maxLength; private int maxPrecision; private int count; private boolean tryTrimming; private ValueMetaInterface stringMeta; private String[] dateFormats; private String[] numberFormats; private static final String[] DEFAULT_NUMBER_FORMATS = new String[] { "#,###,###.#", "#.#", "#", "#.0", "#.00", "#.000", "#.0000", "#.00000", "#.000000", " #.0#" }; protected static final Pattern PRECISION_PATTERN = Pattern.compile("[^0-9#]"); public StringEvaluator() { this(true); } public StringEvaluator(boolean tryTrimming) { this(tryTrimming, DEFAULT_NUMBER_FORMATS, Const.getDateFormats()); } public StringEvaluator(boolean tryTrimming, List<String> numberFormats, List<String> dateFormats) { this(tryTrimming, numberFormats.toArray(new String[numberFormats.size()]), dateFormats.toArray(new String[dateFormats.size()])); } public StringEvaluator(boolean tryTrimming, String[] numberFormats, String[] dateFormats) { this.tryTrimming = tryTrimming; values = new HashSet<String>(); evaluationResults = new ArrayList<StringEvaluationResult>(); count = 0; stringMeta = new ValueMetaString("string"); this.numberFormats = numberFormats; this.dateFormats = dateFormats; populateConversionMetaList(); } public void evaluateString(String value) { count++; if (!values.contains(value)) { values.add(value); if (value != null) { evaluateLength(value); evaluatePrecision(value); challengeConversions(value); } } } private void challengeConversions(String value) { List<StringEvaluationResult> all = new ArrayList<StringEvaluationResult>(evaluationResults); ValueMetaInterface stringMetaClone = null; for (StringEvaluationResult cmm : all) { if (cmm.getConversionMeta().isBoolean()) { // Boolean conversion never fails. // If it's a Y, N, true, false it's a boolean otherwise it ain't. // String string; if (tryTrimming) { string = Const.trim(value); } else { string = value; } if (StringUtils.isEmpty(value)) { cmm.incrementNrNull(); } else if (!("Y".equalsIgnoreCase(string) || "N".equalsIgnoreCase(string) || "TRUE".equalsIgnoreCase(string) || "FALSE".equalsIgnoreCase(string))) { evaluationResults.remove(cmm); } else { cmm.incrementSuccesses(); } } else if (cmm.getConversionMeta().isDate()) { String dateFormat = cmm.getConversionMeta().getConversionMask(); if (!DateDetector.isValidDateFormatToStringDate(dateFormat, value, "en_US")) { evaluationResults.remove(cmm); } else { try { Object object = DateDetector.getDateFromStringByFormat(value, dateFormat); cmm.incrementSuccesses(); if (cmm.getMin() == null || cmm.getConversionMeta().compare(cmm.getMin(), object) > 0) { cmm.setMin(object); } if (cmm.getMax() == null || cmm.getConversionMeta().compare(cmm.getMax(), object) < 0) { cmm.setMax(object); } } catch (ParseException e) { evaluationResults.remove(cmm); } catch (KettleValueException e) { evaluationResults.remove(cmm); } } } else { try { if (cmm.getConversionMeta().isNumeric()) { boolean stop = false; int nrDots = 0; int nrCommas = 0; int pos = 0; for (char c : value.toCharArray()) { boolean currencySymbolMatch = !String.valueOf(c) .equals(cmm.getConversionMeta().getCurrencySymbol()) && c != '(' && c != ')'; if (!Character.isDigit(c) && c != '.' && c != ',' && !Character.isSpaceChar(c) && currencySymbolMatch && (pos > 0 && (c == '+' || c == '-')) // allow + & - at the 1st position ) { evaluationResults.remove(cmm); stop = true; break; } // If the value contains a decimal or grouping symbol or some sort, it's not an integer // if ((c == '.' && cmm.getConversionMeta().isInteger()) || (c == ',' && cmm.getConversionMeta().isInteger())) { evaluationResults.remove(cmm); stop = true; break; } if (c == '.') { nrDots++; } if (c == ',') { nrCommas++; } pos++; } if (nrDots > 1 && nrCommas > 1) { evaluationResults.remove(cmm); stop = true; } if (stop) { continue; } } if (stringMetaClone == null) { // avoid cloning each time stringMetaClone = stringMeta.clone(); } stringMetaClone.setConversionMetadata(cmm.getConversionMeta()); stringMetaClone.setTrimType(cmm.getConversionMeta().getTrimType()); Object object = stringMetaClone.convertDataUsingConversionMetaData(value); // Still here? Evaluate the data... // Keep track of null values, min, max, etc. // if (cmm.getConversionMeta().isNull(object)) { cmm.incrementNrNull(); } else { cmm.incrementSuccesses(); } if (cmm.getMin() == null || cmm.getConversionMeta().compare(cmm.getMin(), object) > 0) { cmm.setMin(object); } if (cmm.getMax() == null || cmm.getConversionMeta().compare(cmm.getMax(), object) < 0) { cmm.setMax(object); } } catch (KettleValueException e) { // This one doesn't work, remove it from the list! // evaluationResults.remove(cmm); } } } } private void evaluateLength(String value) { if (value.length() > maxLength) { maxLength = value.length(); } } private void evaluatePrecision(String value) { int p = determinePrecision(value); if (p > maxPrecision) { maxPrecision = p; } } private boolean containsInteger() { for (StringEvaluationResult result : evaluationResults) { if (result.getConversionMeta().isInteger() && result.getNrSuccesses() > 0) { return true; } } return false; } private boolean containsNumber() { for (StringEvaluationResult result : evaluationResults) { if (result.getConversionMeta().isNumber() && result.getNrSuccesses() > 0) { return true; } } return false; } private boolean containsDate() { for (StringEvaluationResult result : evaluationResults) { if (result.getConversionMeta().isDate() && result.getNrSuccesses() > 0) { return true; } } return false; } public StringEvaluationResult getAdvicedResult() { if (evaluationResults.isEmpty()) { ValueMetaInterface adviced = new ValueMetaString("adviced"); adviced.setLength(maxLength); int nrNulls = 0; String min = null; String max = null; for (String string : values) { if (string != null) { if (min == null || min.compareTo(string) > 0) { min = string; } if (max == null || max.compareTo(string) < 0) { max = string; } } else { nrNulls++; } } StringEvaluationResult result = new StringEvaluationResult(adviced); result.setNrNull(nrNulls); result.setMin(min); result.setMax(max); return result; } else { // If there are Numbers and Integers, pick the integers... // if (containsInteger() && containsNumber()) { for (Iterator<StringEvaluationResult> iterator = evaluationResults.iterator(); iterator .hasNext();) { StringEvaluationResult result = iterator.next(); if (maxPrecision == 0 && result.getConversionMeta().isNumber()) { // no precision, don't bother with a number iterator.remove(); } else if (maxPrecision > 0 && result.getConversionMeta().isInteger()) { // precision is needed, can't use integer iterator.remove(); } } } // If there are Dates and Integers, pick the dates... // if (containsInteger() && containsDate()) { for (Iterator<StringEvaluationResult> iterator = evaluationResults.iterator(); iterator .hasNext();) { StringEvaluationResult result = iterator.next(); if (result.getConversionMeta().isInteger()) { iterator.remove(); } } } Comparator<StringEvaluationResult> compare = null; if (containsDate()) { // want the longest format for dates compare = new Comparator<StringEvaluationResult>() { @Override public int compare(StringEvaluationResult r1, StringEvaluationResult r2) { Integer length1 = r1.getConversionMeta().getConversionMask() == null ? 0 : r1.getConversionMeta().getConversionMask().length(); Integer length2 = r2.getConversionMeta().getConversionMask() == null ? 0 : r2.getConversionMeta().getConversionMask().length(); return length2.compareTo(length1); } }; } else { // want the shortest format mask for numerics & integers compare = new Comparator<StringEvaluationResult>() { @Override public int compare(StringEvaluationResult r1, StringEvaluationResult r2) { Integer length1 = r1.getConversionMeta().getConversionMask() == null ? 0 : r1.getConversionMeta().getConversionMask().length(); Integer length2 = r2.getConversionMeta().getConversionMask() == null ? 0 : r2.getConversionMeta().getConversionMask().length(); return length1.compareTo(length2); } }; } Collections.sort(evaluationResults, compare); StringEvaluationResult result = evaluationResults.get(0); ValueMetaInterface conversionMeta = result.getConversionMeta(); if (conversionMeta.isNumber() && conversionMeta.getCurrencySymbol() == null) { conversionMeta.setPrecision(maxPrecision); if (maxPrecision > 0 && maxLength > 0) { conversionMeta.setLength(maxLength); } } return result; } } public String[] getDateFormats() { return dateFormats; } public String[] getNumberFormats() { return numberFormats; } private void populateConversionMetaList() { int[] trimTypes; if (tryTrimming) { trimTypes = new int[] { ValueMetaInterface.TRIM_TYPE_NONE, ValueMetaInterface.TRIM_TYPE_BOTH, }; } else { trimTypes = new int[] { ValueMetaInterface.TRIM_TYPE_NONE, }; } for (int trimType : trimTypes) { for (String format : getDateFormats()) { ValueMetaInterface conversionMeta = new ValueMetaDate("date"); conversionMeta.setConversionMask(format); conversionMeta.setTrimType(trimType); conversionMeta.setDateFormatLenient(false); evaluationResults.add(new StringEvaluationResult(conversionMeta)); } EvalResultBuilder numberUsBuilder = new EvalResultBuilder("number-us", ValueMetaInterface.TYPE_NUMBER, 15, trimType, ".", ","); EvalResultBuilder numberEuBuilder = new EvalResultBuilder("number-eu", ValueMetaInterface.TYPE_NUMBER, 15, trimType, ",", "."); for (String format : getNumberFormats()) { if (format.equals("#") || format.equals("0")) { // skip the integer ones. we'll get those later continue; } int precision = determinePrecision(format); evaluationResults.add(numberUsBuilder.format(format, precision).build()); evaluationResults.add(numberEuBuilder.format(format, precision).build()); } // Try the locale's Currency DecimalFormat currencyFormat = ((DecimalFormat) NumberFormat.getCurrencyInstance()); ValueMetaInterface conversionMeta = new ValueMetaNumber("number-currency"); // replace the universal currency symbol with the locale's currency symbol for user recognition String currencyMask = currencyFormat.toLocalizedPattern().replace("\u00A4", currencyFormat.getCurrency().getSymbol()); conversionMeta.setConversionMask(currencyMask); conversionMeta.setTrimType(trimType); conversionMeta.setDecimalSymbol( String.valueOf(currencyFormat.getDecimalFormatSymbols().getDecimalSeparator())); conversionMeta.setGroupingSymbol( String.valueOf(currencyFormat.getDecimalFormatSymbols().getGroupingSeparator())); conversionMeta.setCurrencySymbol(currencyFormat.getCurrency().getSymbol()); conversionMeta.setLength(15); int currencyPrecision = currencyFormat.getCurrency().getDefaultFractionDigits(); conversionMeta.setPrecision(currencyPrecision); evaluationResults.add(new StringEvaluationResult(conversionMeta)); // add same mask w/o currency symbol String currencyMaskAsNumeric = currencyMask .replaceAll(Pattern.quote(currencyFormat.getCurrency().getSymbol()), ""); evaluationResults.add(numberUsBuilder.format(currencyMaskAsNumeric, currencyPrecision).build()); evaluationResults.add(numberEuBuilder.format(currencyMaskAsNumeric, currencyPrecision).build()); // Integer // conversionMeta = new ValueMetaInteger("integer"); conversionMeta.setConversionMask("#"); conversionMeta.setLength(15); evaluationResults.add(new StringEvaluationResult(conversionMeta)); conversionMeta = new ValueMetaInteger("integer"); conversionMeta.setConversionMask(" #"); conversionMeta.setLength(15); evaluationResults.add(new StringEvaluationResult(conversionMeta)); // Add support for left zero padded integers // for (int i = 1; i <= 15; i++) { String mask = " "; for (int x = 0; x < i; x++) { mask += "0"; } mask += ";-"; for (int x = 0; x < i; x++) { mask += "0"; } conversionMeta = new ValueMetaInteger("integer-zero-padded-" + i); conversionMeta.setConversionMask(mask); conversionMeta.setLength(i); evaluationResults.add(new StringEvaluationResult(conversionMeta)); } // Boolean // conversionMeta = new ValueMetaBoolean("boolean"); evaluationResults.add(new StringEvaluationResult(conversionMeta)); } } protected static int determinePrecision(String numericFormat) { if (numericFormat != null) { char decimalSymbol = ((DecimalFormat) NumberFormat.getInstance()).getDecimalFormatSymbols() .getDecimalSeparator(); int loc = numericFormat.lastIndexOf(decimalSymbol); if (loc >= 0 && loc < numericFormat.length()) { Matcher m = PRECISION_PATTERN.matcher(numericFormat.substring(loc + 1)); int nonDigitLoc = numericFormat.length(); if (m.find()) { nonDigitLoc = loc + 1 + m.start(); } return numericFormat.substring(loc + 1, nonDigitLoc).length(); } else { return 0; } } else { return 0; } } /** * @return The distinct set of string values */ public Set<String> getValues() { return values; } /** * PDI-7736: Only list of successful evaluations returned. * * @return The list of string evaluation results */ public List<StringEvaluationResult> getStringEvaluationResults() { List<StringEvaluationResult> result = new ArrayList<>(); for (StringEvaluationResult ev : evaluationResults) { if (ev.getNrSuccesses() > 0) { result.add(ev); } } return result; } /** * @return the number of values analyzed */ public int getCount() { return count; } /** * @return The maximum string length encountered */ public int getMaxLength() { return maxLength; } private static class EvalResultBuilder { private final String name; private final int type; private final int length; private final int trimType; private final String decimalSymbol; private final String groupingSymbol; private String format; private int precision; public StringEvaluationResult build() { ValueMetaInterface meta = new ValueMeta(name, type); meta.setConversionMask(format); meta.setTrimType(trimType); meta.setDecimalSymbol(decimalSymbol); meta.setGroupingSymbol(groupingSymbol); meta.setLength(length); meta.setPrecision(precision); return new StringEvaluationResult(meta); } public EvalResultBuilder(String name, int type, int length, int trimType, String decimalSymbol, String groupingSymbol) { this.name = name; this.type = type; this.length = length; this.trimType = trimType; this.decimalSymbol = decimalSymbol; this.groupingSymbol = groupingSymbol; } public EvalResultBuilder format(String format, int precision) { this.format = format; this.precision = precision; return this; } } }