Java tutorial
/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.fuzzymatch; import java.util.Iterator; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.commons.lang.StringUtils; import org.pentaho.di.core.Const; import org.pentaho.di.core.RowSet; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.exception.KettleStepException; import org.pentaho.di.core.exception.KettleValueException; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMeta; import org.pentaho.di.core.row.RowMetaInterface; import org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.core.util.Utils; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepInterface; import org.pentaho.di.trans.step.StepMeta; import org.pentaho.di.trans.step.StepMetaInterface; import com.wcohen.ss.Jaro; import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.NeedlemanWunsch; /** * Performs a fuzzy match for each main stream field row An approximative match is done in a lookup stream * * @author Samatar * @since 03-mars-2008 */ public class FuzzyMatch extends BaseStep implements StepInterface { private static Class<?> PKG = FuzzyMatchMeta.class; // for i18n purposes, needed by Translator2!! private FuzzyMatchMeta meta; private FuzzyMatchData data; public FuzzyMatch(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans) { super(stepMeta, stepDataInterface, copyNr, transMeta, trans); } private boolean readLookupValues() throws KettleException { data.infoStream = meta.getStepIOMeta().getInfoStreams().get(0); if (data.infoStream.getStepMeta() == null) { logError(BaseMessages.getString(PKG, "FuzzyMatch.Log.NoLookupStepSpecified")); return false; } if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.ReadingFromStream") + data.infoStream.getStepname() + "]"); } boolean firstRun = true; // Which row set do we read from? // RowSet rowSet = findInputRowSet(data.infoStream.getStepname()); Object[] rowData = getRowFrom(rowSet); // rows are originating from "lookup_from" while (rowData != null) { if (firstRun) { data.infoMeta = rowSet.getRowMeta().clone(); // Check lookup field int indexOfLookupField = data.infoMeta.indexOfValue(environmentSubstitute(meta.getLookupField())); if (indexOfLookupField < 0) { // The field is unreachable ! throw new KettleException(BaseMessages.getString(PKG, "FuzzyMatch.Exception.CouldnotFindLookField", meta.getLookupField())); } data.infoCache = new RowMeta(); ValueMetaInterface keyValueMeta = data.infoMeta.getValueMeta(indexOfLookupField); keyValueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL); data.infoCache.addValueMeta(keyValueMeta); // Add key data.indexOfCachedFields[0] = indexOfLookupField; // Check additional fields if (data.addAdditionalFields) { ValueMetaInterface additionalFieldValueMeta; for (int i = 0; i < meta.getValue().length; i++) { int fi = i + 1; data.indexOfCachedFields[fi] = data.infoMeta.indexOfValue(meta.getValue()[i]); if (data.indexOfCachedFields[fi] < 0) { // The field is unreachable ! throw new KettleException(BaseMessages.getString(PKG, "FuzzyMatch.Exception.CouldnotFindLookField", meta.getValue()[i])); } additionalFieldValueMeta = data.infoMeta.getValueMeta(data.indexOfCachedFields[fi]); additionalFieldValueMeta.setStorageType(ValueMetaInterface.STORAGE_TYPE_NORMAL); data.infoCache.addValueMeta(additionalFieldValueMeta); } data.nrCachedFields += meta.getValue().length; } } if (log.isRowLevel()) { logRowlevel(BaseMessages.getString(PKG, "FuzzyMatch.Log.ReadLookupRow") + rowSet.getRowMeta().getString(rowData)); } // Look up the keys in the source rows // and store values in cache Object[] storeData = new Object[data.nrCachedFields]; // Add key field if (rowData[data.indexOfCachedFields[0]] == null) { storeData[0] = ""; } else { ValueMetaInterface fromStreamRowMeta = rowSet.getRowMeta() .getValueMeta(data.indexOfCachedFields[0]); if (fromStreamRowMeta.isStorageBinaryString()) { storeData[0] = fromStreamRowMeta .convertToNormalStorageType(rowData[data.indexOfCachedFields[0]]); } else { storeData[0] = rowData[data.indexOfCachedFields[0]]; } } // Add additional fields? for (int i = 1; i < data.nrCachedFields; i++) { ValueMetaInterface fromStreamRowMeta = rowSet.getRowMeta() .getValueMeta(data.indexOfCachedFields[i]); if (fromStreamRowMeta.isStorageBinaryString()) { storeData[i] = fromStreamRowMeta .convertToNormalStorageType(rowData[data.indexOfCachedFields[i]]); } else { storeData[i] = rowData[data.indexOfCachedFields[0]]; } } if (isDebug()) { logDebug(BaseMessages.getString(PKG, "FuzzyMatch.Log.AddingValueToCache", data.infoCache.getString(storeData))); } addToCache(storeData); rowData = getRowFrom(rowSet); if (firstRun) { firstRun = false; } } return true; } private Object[] lookupValues(RowMetaInterface rowMeta, Object[] row) throws KettleException { if (first) { first = false; data.outputRowMeta = getInputRowMeta().clone(); meta.getFields(data.outputRowMeta, getStepname(), new RowMetaInterface[] { data.infoMeta }, null, this, repository, metaStore); // Check lookup field data.indexOfMainField = getInputRowMeta() .indexOfValue(environmentSubstitute(meta.getMainStreamField())); if (data.indexOfMainField < 0) { // The field is unreachable ! throw new KettleException(BaseMessages.getString(PKG, "FuzzyMatch.Exception.CouldnotFindMainField", meta.getMainStreamField())); } } Object[] add = null; try { add = getFromCache(row); } catch (Exception e) { throw new KettleStepException(e); } return RowDataUtil.addRowData(row, rowMeta.size(), add); } private void addToCache(Object[] value) throws KettleException { try { data.look.add(value); } catch (java.lang.OutOfMemoryError o) { // exception out of memory throw new KettleException(BaseMessages.getString(PKG, "FuzzyMatch.Error.JavaHeap", o.toString())); } } private Object[] getFromCache(Object[] keyRow) throws KettleValueException { if (isDebug()) { logDebug(BaseMessages.getString(PKG, "FuzzyMatch.Log.ReadingMainStreamRow", getInputRowMeta().getString(keyRow))); } Object[] retval = null; switch (meta.getAlgorithmType()) { case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: retval = doDistance(keyRow); break; case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE: case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE: case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX: case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX: retval = doPhonetic(keyRow); break; case FuzzyMatchMeta.OPERATION_TYPE_JARO: case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY: retval = doSimilarity(keyRow); break; default: break; } return retval; } private Object[] doDistance(Object[] row) throws KettleValueException { // Reserve room Object[] rowData = buildEmptyRow(); Iterator<Object[]> it = data.look.iterator(); long distance = -1; // Object o=row[data.indexOfMainField]; String lookupvalue = getInputRowMeta().getString(row, data.indexOfMainField); while (it.hasNext()) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; int cdistance = -1; String usecacheValue = cacheValue; String uselookupvalue = lookupvalue; if (!meta.isCaseSensitive()) { usecacheValue = cacheValue.toLowerCase(); uselookupvalue = lookupvalue.toLowerCase(); } switch (meta.getAlgorithmType()) { case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: cdistance = Utils.getDamerauLevenshteinDistance(usecacheValue, uselookupvalue); break; case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: cdistance = Math.abs((int) new NeedlemanWunsch().score(usecacheValue, uselookupvalue)); break; default: cdistance = StringUtils.getLevenshteinDistance(usecacheValue, uselookupvalue); break; } if (data.minimalDistance <= cdistance && cdistance <= data.maximalDistance) { if (meta.isGetCloserValue()) { if (cdistance < distance || distance == -1) { // Get closer value // minimal distance distance = cdistance; int index = 0; rowData[index++] = cacheValue; // Add metric value? if (data.addValueFieldName) { rowData[index++] = distance; } // Add additional return values? if (data.addAdditionalFields) { for (int i = 0; i < meta.getValue().length; i++) { int nr = i + 1; int nf = i + index; rowData[nf] = cachedData[nr]; } } } } else { // get all values separated by values separator if (rowData[0] == null) { rowData[0] = cacheValue; } else { rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue; } } } } return rowData; } private Object[] doPhonetic(Object[] row) { // Reserve room Object[] rowData = buildEmptyRow(); Iterator<Object[]> it = data.look.iterator(); Object o = row[data.indexOfMainField]; String lookupvalue = (String) o; String lookupValueMF = getEncodedMF(lookupvalue, meta.getAlgorithmType()); while (it.hasNext()) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; String cacheValueMF = getEncodedMF(cacheValue, meta.getAlgorithmType()); if (lookupValueMF.equals(cacheValueMF)) { // Add match value int index = 0; rowData[index++] = cacheValue; // Add metric value? if (data.addValueFieldName) { rowData[index++] = cacheValueMF; } // Add additional return values? if (data.addAdditionalFields) { for (int i = 0; i < meta.getValue().length; i++) { int nf = i + index; int nr = i + 1; rowData[nf] = cachedData[nr]; } } } } return rowData; } private String getEncodedMF(String value, Integer algorithmType) { String encodedValueMF = ""; switch (algorithmType) { case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE: encodedValueMF = (new Metaphone()).metaphone(value); break; case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE: encodedValueMF = ((new DoubleMetaphone()).doubleMetaphone(value)); break; case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX: encodedValueMF = (new Soundex()).encode(value); break; case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX: encodedValueMF = (new RefinedSoundex()).encode(value); break; default: break; } return encodedValueMF; } private Object[] doSimilarity(Object[] row) { // Reserve room Object[] rowData = buildEmptyRow(); // prepare to read from cache ... Iterator<Object[]> it = data.look.iterator(); double similarity = 0; // get current value from main stream Object o = row[data.indexOfMainField]; String lookupvalue = o == null ? "" : (String) o; while (it.hasNext()) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; double csimilarity = new Double(0); switch (meta.getAlgorithmType()) { case FuzzyMatchMeta.OPERATION_TYPE_JARO: csimilarity = new Jaro().score(cacheValue, lookupvalue); break; case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: csimilarity = new JaroWinkler().score(cacheValue, lookupvalue); break; default: // Letters pair similarity csimilarity = LetterPairSimilarity.getSimiliarity(cacheValue, lookupvalue); break; } if (data.minimalSimilarity <= csimilarity && csimilarity <= data.maximalSimilarity) { if (meta.isGetCloserValue()) { if (csimilarity > similarity || (csimilarity == 0 && cacheValue.equals(lookupvalue))) { similarity = csimilarity; // Update match value int index = 0; rowData[index++] = cacheValue; // Add metric value? if (data.addValueFieldName) { rowData[index++] = new Double(similarity); } // Add additional return values? if (data.addAdditionalFields) { for (int i = 0; i < meta.getValue().length; i++) { int nf = i + index; int nr = i + 1; rowData[nf] = cachedData[nr]; } } } } else { // get all values separated by values separator if (rowData[0] == null) { rowData[0] = cacheValue; } else { rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue; } } } } return rowData; } /** * Build an empty row based on the meta-data... * * @return */ private Object[] buildEmptyRow() { Object[] rowData = RowDataUtil.allocateRowData(data.outputRowMeta.size()); return rowData; } public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; if (data.readLookupValues) { data.readLookupValues = false; // Read values from lookup step (look) if (!readLookupValues()) { logError(BaseMessages.getString(PKG, "FuzzyMatch.Log.UnableToReadDataFromLookupStream")); setErrors(1); stopAll(); return false; } if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.ReadValuesInMemory", data.look.size())); } } Object[] r = getRow(); // Get row from input rowset & set row busy! if (r == null) { // no more input to be expected... if (isDetailed()) { logDetailed( BaseMessages.getString(PKG, "FuzzyMatch.Log.StoppedProcessingWithEmpty", getLinesRead())); } setOutputDone(); return false; } try { // Do the actual lookup in the hastable. Object[] outputRow = lookupValues(getInputRowMeta(), r); if (outputRow == null) { setOutputDone(); // signal end to receiver(s) return false; } putRow(data.outputRowMeta, outputRow); // copy row to output rowset(s); if (checkFeedback(getLinesRead())) { if (log.isBasic()) { logBasic(BaseMessages.getString(PKG, "FuzzyMatch.Log.LineNumber") + getLinesRead()); } } } catch (KettleException e) { boolean sendToErrorRow = false; String errorMessage = null; if (getStepMeta().isDoingErrorHandling()) { sendToErrorRow = true; errorMessage = e.toString(); } else { logError(BaseMessages.getString(PKG, "FuzzyMatch.Log.ErrorInStepRunning") + e.getMessage()); setErrors(1); stopAll(); setOutputDone(); // signal end to receiver(s) return false; } if (sendToErrorRow) { // Simply add this row to the error row putError(getInputRowMeta(), r, 1, errorMessage, meta.getMainStreamField(), "FuzzyMatch001"); } } return true; } public boolean init(StepMetaInterface smi, StepDataInterface sdi) { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; if (super.init(smi, sdi)) { // Check lookup and main stream field if (Const.isEmpty(meta.getMainStreamField())) { logError(BaseMessages.getString(PKG, "FuzzyMatch.Error.MainStreamFieldMissing")); return false; } if (Const.isEmpty(meta.getLookupField())) { logError(BaseMessages.getString(PKG, "FuzzyMatch.Error.LookupStreamFieldMissing")); return false; } // Checks output fields String matchField = environmentSubstitute(meta.getOutputMatchField()); if (Const.isEmpty(matchField)) { logError(BaseMessages.getString(PKG, "FuzzyMatch.Error.OutputMatchFieldMissing")); return false; } // We need to add metrics (distance, similarity, ...) // only when the fieldname is provided // and user want to return the closer value data.addValueFieldName = (!Const.isEmpty(environmentSubstitute(meta.getOutputValueField())) && meta.isGetCloserValue()); // Set the number of fields to cache // default value is one int nrFields = 1; if (meta.getValue() != null && meta.getValue().length > 0) { if (meta.isGetCloserValue() || (meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE) || (meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX) || (meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX) || (meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_METAPHONE)) { // cache also additional fields data.addAdditionalFields = true; nrFields += meta.getValue().length; } } data.indexOfCachedFields = new int[nrFields]; switch (meta.getAlgorithmType()) { case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: data.minimalDistance = Const.toInt(environmentSubstitute(meta.getMinimalValue()), 0); if (isDetailed()) { logDetailed( BaseMessages.getString(PKG, "FuzzyMatch.Log.MinimalDistance", data.minimalDistance)); } data.maximalDistance = Const.toInt(environmentSubstitute(meta.getMaximalValue()), 5); if (isDetailed()) { logDetailed( BaseMessages.getString(PKG, "FuzzyMatch.Log.MaximalDistance", data.maximalDistance)); } if (!meta.isGetCloserValue()) { data.valueSeparator = environmentSubstitute(meta.getSeparator()); if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.Separator", data.valueSeparator)); } } break; case FuzzyMatchMeta.OPERATION_TYPE_JARO: case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY: data.minimalSimilarity = Const.toDouble(environmentSubstitute(meta.getMinimalValue()), 0); if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.MinimalSimilarity", data.minimalSimilarity)); } data.maximalSimilarity = Const.toDouble(environmentSubstitute(meta.getMaximalValue()), 1); if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.MaximalSimilarity", data.maximalSimilarity)); } if (!meta.isGetCloserValue()) { data.valueSeparator = environmentSubstitute(meta.getSeparator()); if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "FuzzyMatch.Log.Separator", data.valueSeparator)); } } break; default: break; } data.readLookupValues = true; return true; } return false; } public void dispose(StepMetaInterface smi, StepDataInterface sdi) { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; data.look.clear(); super.dispose(smi, sdi); } }