Java tutorial
/* * Reconciliation and Matching Framework * Copyright 2014 Royal Botanic Gardens, Kew * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.kew.rmf.core.lucene; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import javax.script.ScriptException; import org.apache.commons.lang.StringUtils; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.kew.rmf.core.DataHandler; import org.kew.rmf.core.configuration.Configuration; import org.kew.rmf.core.configuration.MatchConfiguration; import org.kew.rmf.core.configuration.Property; import org.kew.rmf.core.exception.DataLoadException; import org.kew.rmf.core.exception.MatchExecutionException; import org.kew.rmf.core.exception.TooManyMatchesException; import org.kew.rmf.matchers.MatchException; import org.kew.rmf.reporters.LuceneReporter; import org.kew.rmf.reporters.Piper; import org.kew.rmf.transformers.TransformationException; import org.kew.rmf.transformers.Transformer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
org.supercsv.io.CsvMapReader;
import org.supercsv.prefs.CsvPreference;

/**
 * Performs the actual match against the Lucene index.
 *
 * {@link #getMatches(Map)} returns a list of matches.
 */
public class LuceneMatcher extends LuceneHandler<MatchConfiguration> implements DataHandler<MatchConfiguration> {
    private static final Logger logger = LoggerFactory.getLogger(LuceneMatcher.class);

    protected MatchConfiguration matchConfig;

    /**
     * Hands the current configuration to the data loader and lets it load the data.
     *
     * @throws DataLoadException if the data loader fails
     */
    @Override // from DataHandler
    public void loadData() throws DataLoadException {
        this.dataLoader.setConfig(this.getConfig());
        this.dataLoader.load();
    }

    /**
     * Performs a match against the Lucene index, and returns a list of matches.
     * <p>
     * Note that {@code record} is modified: for every configured property the transformed
     * query value is stored under the query column name plus
     * {@link Configuration#TRANSFORMED_SUFFIX}.
     * <p>
     * An empty list is returned when the record is excluded by the record filter or when
     * the generated query is empty.
     *
     * @param record The record needing to be matched
     * @return A list of matched records, sorted via {@link #sortMatches(List)}
     * @throws TooManyMatchesException Thrown if the number of hits reaches or exceeds the configured maximum number of search results
     * @throws MatchExecutionException For other errors finding a match
     */
    public List<Map<String, String>> getMatches(Map<String, String> record) throws TooManyMatchesException, MatchExecutionException {
        // a record for which an existing filter evaluates to false yields no matches
        try {
            if (!StringUtils.isBlank(config.getRecordFilter()) && !jsEnv.evalFilter(config.getRecordFilter(), record)) {
                logger.debug("All records excluded by record filter");
                return new ArrayList<>();
            }
        }
        catch (ScriptException e) {
            throw new MatchExecutionException("Error evaluating recordFilter on record " + record, e);
        }

        // transform fields where required
        for (Property prop : config.getProperties()) {
            String fName = prop.getQueryColumnName();
            String fValue = record.get(fName);
            // transform the field-value..
            fValue = fValue == null ? "" : fValue; // super-csv treats blank as null, we don't for now
            for (Transformer t : prop.getQueryTransformers()) {
                try {
                    fValue = t.transform(fValue);
                }
                catch (TransformationException e) {
                    throw new MatchExecutionException("Error evaluating transformer " + t + " on record " + record, e);
                }
            }
            // ..and put it into the record
            record.put(fName + Configuration.TRANSFORMED_SUFFIX, fValue);
        }

        String fromId = record.get(Configuration.ID_FIELD_NAME);

        // Use the properties to select a set of documents which may contain matches
        String querystr = LuceneUtils.buildQuery(config.getProperties(), record, false);

        // If the query for some reason ends up empty there is nothing to search for
        // TODO: create a log-file that stores critical log messages?
        if (querystr.equals("")) {
            logger.warn("Empty query for record {}", record);
            return new ArrayList<>();
        }

        // Perform the match
        TopDocs td;
        try {
            td = queryLucene(querystr, this.getIndexSearcher(), config.getMaxSearchResults());
            if (td.totalHits >= config.getMaxSearchResults()) {
                // log the query that actually produced too many hits (previously a constant placeholder was logged)
                logger.info("Error matching {}", querystr);
                throw new TooManyMatchesException(String.format(
                        "Number of max search results exceeded for record %s! "
                        + "You should either tweak your config to bring back less possible results making better use of the \"useInSelect\" switch (recommended) or raise the \"maxSearchResults\" number.",
                        record));
            }
            logger.debug("Found {} possibles to assess against {}", td.totalHits, fromId);
        }
        catch (ParseException | IOException e) {
            throw new MatchExecutionException("Error querying Lucene on query " + record, e);
        }

        // Keep only those candidate documents that the configured matchers accept
        List<Map<String, String>> matches = new ArrayList<>();
        for (ScoreDoc sd : td.scoreDocs) {
            try {
                Document toDoc = getFromLucene(sd.doc);
                if (LuceneUtils.recordsMatch(record, toDoc, config.getProperties())) {
                    Map<String, String> toDocAsMap = LuceneUtils.doc2Map(toDoc);
                    matches.add(toDocAsMap);
                    logger.info("Match is {}", toDocAsMap);
                }
            }
            catch (MatchException e) {
                throw new MatchExecutionException("Error running matcher for " + record, e);
            }
            catch (IOException e) {
                throw new MatchExecutionException("Error retrieving match result from Lucene " + sd.doc, e);
            }
        }

        sortMatches(matches);
        return matches;
    }

    /**
     * Sorts the matches in place, in descending order of the configured sort field.
     * <p>
     * If every match has a value for the sort field that parses as an integer, the values
     * are compared numerically; otherwise they are compared as strings. In the string
     * case a missing (null) value is ordered after all present values instead of causing
     * a {@link NullPointerException}.
     *
     * @param matches the matches to sort; the list is reordered in place
     */
    public void sortMatches(List<Map<String, String>> matches) {
        final String sortOn = config.getSortFieldName();
        // Decide once, before sorting, rather than aborting a half-finished sort via an exception
        final boolean numeric = allIntegers(matches, sortOn);

        Collections.sort(matches, Collections.reverseOrder(new Comparator<Map<String, String>>() {
            @Override
            public int compare(final Map<String, String> m1, final Map<String, String> m2) {
                final String v1 = m1.get(sortOn);
                final String v2 = m2.get(sortOn);
                if (numeric) {
                    return Integer.valueOf(v1).compareTo(Integer.valueOf(v2));
                }
                return compareNullsLowest(v1, v2);
            }
        }));
    }

    /** Returns true if every match has a non-null value for {@code field} that parses as an integer. */
    private static boolean allIntegers(List<Map<String, String>> matches, String field) {
        for (Map<String, String> match : matches) {
            try {
                Integer.parseInt(match.get(field)); // also throws NumberFormatException for null
            }
            catch (NumberFormatException e) {
                return false;
            }
        }
        return true;
    }

    /** Compares two strings in natural order, treating null as lower than any non-null value. */
    private static int compareNullsLowest(String a, String b) {
        if (a == null) {
            return (b == null) ? 0 : -1;
        }
        if (b == null) {
            return 1;
        }
        return a.compareTo(b);
    }

    /**
     * Run the whole matching task.
     *
     * The iterative flow is:
     * - load the data (== write the lucene index)
     * - iterate over the query data file
     * - for each record, look for matches in the index
     * - for each record, report into new fields of this record about matches via reporters
     *
     * The main difference to a deduplication task as defined by {@link LuceneDeduplicator}
     * is that we use two different datasets, one to create the authority index, the other one as
     * query file (where we iterate over each record to look up possible matches).
     *
     * @throws Exception if the query file is empty, if its header lacks a configured query
     *         column or the id field, or if loading, matching or reporting fails
     */
    @Override
    public void run() throws Exception {
        this.loadData(); // writes the index according to the configuration

        // TODO: either make quote characters and line break characters configurable or simplify even more?
        CsvPreference csvPref = new CsvPreference.Builder('"', this.getConfig().getQueryFileDelimiter().charAt(0), "\n").build();
        int i = 0;

        // Each stream is its own resource so that the file handle is released even if
        // creating the reader on top of it fails (e.g. unsupported encoding).
        try (MatchConfiguration config = this.getConfig();
             IndexReader indexReader = this.getIndexReader();
             FileInputStream queryStream = new FileInputStream(this.getConfig().getQueryFile());
             InputStreamReader queryReader = new InputStreamReader(queryStream, this.getConfig().getQueryFileEncoding());
             CsvMapReader mr = new CsvMapReader(queryReader, csvPref)) {

            this.prepareEnvs();

            // loop over the queryFile
            int numMatches = 0;
            final String[] header = mr.getHeader(true);
            if (header == null) {
                // no header row could be read, so there is nothing to validate or match
                throw new Exception(String.format("%s: Query file is empty, no header found.", this.config.getQueryFile().getPath()));
            }

            // check whether the header column names fit to the ones specified in the configuration
            List<String> headerList = Arrays.asList(header);
            for (String name : this.config.getPropertyQueryColumnNames()) {
                if (!headerList.contains(name)) {
                    throw new Exception(String.format("%s: Header doesn't contain field name < %s > as defined in config.", this.config.getQueryFile().getPath(), name));
                }
            }
            // same for the id-field
            String idFieldName = Configuration.ID_FIELD_NAME;
            if (!headerList.contains(idFieldName)) {
                throw new Exception(String.format("%s: Id field name not found in header, should be %s!", this.config.getQueryFile().getPath(), idFieldName));
            }

            Map<String, String> record;
            while ((record = mr.read(header)) != null) {
                List<Map<String, String>> matches = getMatches(record);
                // NOTE(review): getMatches() as implemented in this class always returns a list,
                // so this branch is only reachable if an override returns null - confirm whether
                // filtered records are still meant to be piped.
                if (matches == null) {
                    for (Piper piper : config.getPipers()) {
                        piper.pipe(record);
                    }
                    continue;
                }
                numMatches += matches.size();
                if (i++ % config.getAssessReportFrequency() == 0) {
                    logger.info("Assessed {} records, found {} matches", i, numMatches);
                }
                // call each reporter that has a say; all they get is a complete list of duplicates for this record.
                for (LuceneReporter reporter : config.getReporters()) {
                    // TODO: make idFieldName configurable, but not on reporter level
                    reporter.report(record, matches);
                }
            }
            logger.info("Assessed {} records, found {} matches", i, numMatches);
        }
    }
}