org.kew.rmf.core.lucene.LuceneDataLoader.java Source code

Introduction

Here is the source code for org.kew.rmf.core.lucene.LuceneDataLoader.java, a DataLoader implementation that loads authority records from a delimited file or a database into a Lucene index. A short usage sketch follows the listing.

Source

/*
 * Reconciliation and Matching Framework
 * Copyright © 2014 Royal Botanic Gardens, Kew
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.kew.rmf.core.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.kew.rmf.core.DataLoader;
import org.kew.rmf.core.DatabaseRecordSource;
import org.kew.rmf.core.configuration.Configuration;
import org.kew.rmf.core.configuration.Property;
import org.kew.rmf.core.exception.DataLoadException;
import org.kew.rmf.transformers.Transformer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.supercsv.io.CsvMapReader;
import org.supercsv.prefs.CsvPreference;

/**
 * This {@link DataLoader} implementation reads records from a delimited file or a
 * database and stores them in a Lucene index.
 * The rules for this are defined in the corresponding {@link Configuration}.
 */
public class LuceneDataLoader implements DataLoader {

    /**
     * RAM buffer size in MB to use when loading data into Lucene.
     */
    private static final double RAM_BUFFER_SIZE = 64;

    private org.apache.lucene.util.Version luceneVersion;

    /**
     * The IndexWriter is set in {@link #openIndex()}, and closed at the end of {@link #load()}.
     */
    private IndexWriter indexWriter;

    private FSDirectory directory;

    private Analyzer luceneAnalyzer;

    private Configuration config;
    private String configName;

    private static final Logger logger = LoggerFactory.getLogger(LuceneDataLoader.class);

    /**
     * Load the authority data into Lucene.
     */
    @Override
    public void load() throws DataLoadException {
        try {
            indexWriter = openIndex();

            if (getConfig().isReuseIndex()) {
                logger.warn("{}: Reuse index not yet implemented, rereading", configName);
            }

            /*
             * If no authority source is specified, we assume a de-duplication task and
             * use the query file for index creation as well.
             *
             * This is why the query-related properties are copied over to the
             * authority-related ones.
             */
            if (config.getAuthorityRecords() == null) {
                if (config.getAuthorityFile() == null) {
                    for (Property p : config.getProperties()) {
                        p.setAuthorityTransformers(p.getQueryTransformers());
                        p.setAddOriginalAuthorityValue(p.isAddOriginalQueryValue());
                        p.setAddTransformedAuthorityValue(p.isAddTransformedQueryValue());
                        p.setAuthorityColumnName(p.getQueryColumnName());
                    }
                    config.setAuthorityFile(config.getQueryFile());
                    config.setAuthorityFileEncoding(config.getQueryFileEncoding());
                    config.setAuthorityFileDelimiter(config.getQueryFileDelimiter());
                }
                this.load(config.getAuthorityFile());
            } else {
                logger.info("{}: Starting data load", configName);
                this.load(config.getAuthorityRecords());
            }
        } catch (IOException e) {
            throw new DataLoadException(configName + ": Problem creating/opening Lucene index", e);
        } finally {
            // Close the index, no matter what happened.
            try {
                if (indexWriter != null) {
                    indexWriter.close();
                    logger.info("{}: IndexWriter {} closed", configName, indexWriter.getDirectory());
                }
            } catch (IOException e) {
                throw new DataLoadException(configName + ": Error closing Lucene index for " + configName, e);
            }
        }
    }

    /**
     * Opens an IndexWriter. Index reuse is not yet implemented, so any existing
     * index is overwritten; if opening fails, the index directory is wiped and
     * creation is retried.
     */
    private IndexWriter openIndex() throws IOException {
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(getLuceneVersion(), luceneAnalyzer);
        indexWriterConfig.setRAMBufferSizeMB(RAM_BUFFER_SIZE);

        // Index reuse is not yet implemented; when it is, the open mode should be
        // chosen according to the configuration:
        //
        // if (getConfig().isReuseIndex()) {
        //     // Reuse the index if it exists, otherwise create a new one.
        //     logger.debug("{}: Reusing existing index, if it exists", configName);
        //     indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        // }
        // else {

        // Create a new index, overwriting any that already exists.
        logger.debug("{}: Overwriting existing index, if it exists", configName);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        // }

        IndexWriter indexWriter;

        try {
            indexWriter = new IndexWriter(directory, indexWriterConfig);
        } catch (IOException e) {
            logger.warn("Exception while creating index, removing index directory and retrying", e);
            // Try deleting the index directory.
            File dir = directory.getDirectory();
            if (dir.isDirectory() && dir.listFiles() != null) {
                logger.warn("{}: Wiping existing index directory {}", configName, dir);
                FileUtils.deleteDirectory(dir);
            }
            indexWriter = new IndexWriter(directory, indexWriterConfig);
        }

        return indexWriter;
    }

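    /**
     * Loads authority records from a database query result into the Lucene index.
     */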
    private void load(DatabaseRecordSource recordSource) throws DataLoadException {
        int i = 0;
        int errors = 0;

        ResultSet resultSet;
        try {
            resultSet = recordSource.getResultSet();
        } catch (SQLException e) {
            throw new DataLoadException(configName + ": Problem reading data from database " + recordSource, e);
        }

        try {
            // check whether the necessary column names are present in the ResultSet
            for (String headerName : this.config.getPropertyAuthorityColumnNames()) {
                try {
                    resultSet.findColumn(headerName);
                } catch (SQLException se) {
                    throw new DataLoadException(String.format(
                            "%s: Database result doesn't contain field name %s as defined in config.",
                            this.config.getName(), headerName));
                }
            }
            // same for the id-field
            try {
                resultSet.findColumn(Configuration.ID_FIELD_NAME);
            } catch (SQLException se) {
                throw new DataLoadException(
                        String.format("%s: Database result doesn't contain field name %s as defined in config.",
                                this.config.getName(), Configuration.ID_FIELD_NAME));
            }

            Document doc = null;
            while (resultSet.next()) {
                // Index record
                try {
                    doc = this.indexRecord(resultSet);
                } catch (Exception e) {
                    errors++;
                    String message = configName + ": Problem indexing record " + i + ", " + resultSet;
                    if (errors < config.getMaximumLoadErrors()) {
                        logger.warn(message, e);
                    } else {
                        throw new DataLoadException(message, e);
                    }
                }

                // Log progress
                if (i++ % this.config.getLoadReportFrequency() == 0) {
                    logger.info("{}: Indexed {} documents", configName, i);
                    logger.debug("{}: Most recent indexed document was {}", configName, doc);
                }
            }
            recordSource.close();

            indexWriter.commit();
        } catch (Exception e) {
            throw new DataLoadException(configName + ": Error loading records at row " + i, e);
        }
        logger.info("{}: Indexed {} records", configName, i);
    }

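    /**
     * Loads authority records from a delimited text file into the Lucene index.
     */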
    private void load(File file) throws DataLoadException {
        int i = 0;
        int errors = 0;

        // TODO: either make quote characters and line break characters configurable or simplify even more?
        CsvPreference customCsvPref = new CsvPreference.Builder('"',
                this.config.getAuthorityFileDelimiter().charAt(0), "\n").build();

        try (CsvMapReader mr = new CsvMapReader(new InputStreamReader(new FileInputStream(file), "UTF-8"),
                customCsvPref)) {

            final String[] header = mr.getHeader(true);
            // check whether the header column names fit to the ones specified in the configuration
            List<String> headerList = Arrays.asList(header);
            for (String name : this.config.getPropertyAuthorityColumnNames()) {
                if (!headerList.contains(name))
                    throw new DataLoadException(
                            String.format("%s: Header doesn't contain field name < %s > as defined in config.",
                                    this.config.getAuthorityFile().getPath(), name));
            }
            // same for the id-field
            String idFieldName = Configuration.ID_FIELD_NAME;
            if (!headerList.contains(idFieldName))
                throw new DataLoadException(String.format("%s: Id field name not found in header, should be %s!",
                        this.config.getAuthorityFile().getPath(), idFieldName));
            Map<String, String> record = mr.read(header);

            Document doc = null;
            while (record != null) {
                // Index record
                try {
                    doc = this.indexRecord(record);
                } catch (Exception e) {
                    errors++;
                    String message = configName + ": Problem indexing record " + i + ", " + record;
                    if (errors < config.getMaximumLoadErrors()) {
                        logger.warn(message, e);
                    } else {
                        throw new DataLoadException(message, e);
                    }
                }

                // Log progress
                if (i++ % this.config.getLoadReportFrequency() == 0) {
                    logger.info("{}: Indexed {} documents", configName, i);
                    logger.debug("{}: Most recent indexed document was {}", configName, doc);
                }

                // Read next record from CSV/datasource
                try {
                    record = mr.read(header);
                } catch (Exception e) {
                    errors++;
                    String message = configName + ": Problem reading record " + i + " " + mr.getUntokenizedRow();
                    if (errors < config.getMaximumLoadErrors()) {
                        logger.warn(message, e);
                    } else {
                        throw new DataLoadException(message, e);
                    }
                }
            }

            indexWriter.commit();
        } catch (Exception e) {
            throw new DataLoadException(configName + ": Error loading records at line " + i, e);
        }
        logger.info("{}: Indexed {} records", configName, i);
    }

    /**
     * Adds a single database record to the Lucene index.
     * @param record the current row of the {@link ResultSet}
     * @return the document that was indexed
     * @throws Exception if the record could not be read or indexed
     */
    private Document indexRecord(ResultSet record) throws Exception {
        Document doc = new Document();
        String idFieldName = Configuration.ID_FIELD_NAME;
        logger.trace("rawRecord: {}", record.toString());

        doc.add(new StringField(idFieldName, record.getString(idFieldName), Field.Store.YES));

        // The remainder of the columns are added as specified in the properties
        for (Property p : this.config.getProperties()) {
            String authorityName = p.getAuthorityColumnName() + Configuration.TRANSFORMED_SUFFIX;
            String value = record.getString(p.getAuthorityColumnName());
            if (value == null)
                value = "";

            // Index the value in its original state, pre transformation
            Field f = new TextField(p.getAuthorityColumnName(), value, Field.Store.YES);
            doc.add(f);

            // then transform the value if necessary
            for (Transformer t : p.getAuthorityTransformers()) {
                value = t.transform(value);
            }
            // and add this one to the index
            Field fTransformed = new TextField(authorityName, value, Field.Store.YES);
            doc.add(fTransformed);

            // For some fields (those which will be passed into a fuzzy matcher like Levenshtein), we index the length
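            // Zero-padding keeps the lexicographic term order consistent with numeric order (for lengths up to 99).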
            if (p.isIndexLength()) {
                Field fLength = new StringField(authorityName + Configuration.LENGTH_SUFFIX,
                        String.format("%02d", value.length()), Field.Store.YES);
                doc.add(fLength);
            }

            if (p.isIndexInitial() && StringUtils.isNotBlank(value)) {
                Field finit = new TextField(authorityName + Configuration.INITIAL_SUFFIX, value.substring(0, 1),
                        Field.Store.YES);
                doc.add(finit);
            }
        }

        logger.trace("Document to be indexed: {}", doc.toString());
        this.indexWriter.addDocument(doc);
        return doc;
    }

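    /**
     * Adds a single CSV record to the Lucene index.
     * @param record a map from column name to value, as read by the CsvMapReader
     * @return the document that was indexed
     * @throws Exception if the record could not be indexed
     */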
    private Document indexRecord(Map<String, String> record) throws Exception {
        Document doc = new Document();
        String idFieldName = Configuration.ID_FIELD_NAME;
        logger.trace("rawRecord: {}", record.toString());
        doc.add(new StringField(idFieldName, record.get(idFieldName), Field.Store.YES));
        // The remainder of the columns are added as specified in the properties
        for (Property p : this.config.getProperties()) {
            String authorityName = p.getAuthorityColumnName() + Configuration.TRANSFORMED_SUFFIX;
            String value = record.get(p.getAuthorityColumnName());
            // super-csv treats blank as null, we don't for now
            value = (value != null) ? value : "";

            // Index the value in its original state, pre transformation..
            Field f = new TextField(p.getAuthorityColumnName(), value, Field.Store.YES);
            doc.add(f);

            // ..*then* transform the value if necessary..
            for (Transformer t : p.getAuthorityTransformers()) {
                value = t.transform(value);
            }
            //.. and add this one to the index
            Field f1 = new TextField(authorityName, value, Field.Store.YES);
            doc.add(f1);

            // For some fields (those which will be passed into a fuzzy matcher like Levenshtein), we index the length
            if (p.isIndexLength()) {
                int length = 0;
                if (value != null)
                    length = value.length();
                Field fl = new StringField(authorityName + Configuration.LENGTH_SUFFIX,
                        String.format("%02d", length), Field.Store.YES);
                doc.add(fl);
            }
            if (p.isIndexInitial() && StringUtils.isNotBlank(value)) {
                Field finit = new TextField(authorityName + Configuration.INITIAL_SUFFIX, value.substring(0, 1),
                        Field.Store.YES);
                doc.add(finit);
            }
        }
        logger.trace("Document to be indexed: {}", doc.toString());
        this.indexWriter.addDocument(doc);
        return doc;
    }

    /*  Getters and setters  */
    public org.apache.lucene.util.Version getLuceneVersion() {
        return luceneVersion;
    }

    public void setLuceneVersion(org.apache.lucene.util.Version luceneVersion) {
        this.luceneVersion = luceneVersion;
    }

    public FSDirectory getDirectory() {
        return directory;
    }

    public void setDirectory(FSDirectory directory) {
        this.directory = directory;
    }

    public Analyzer getLuceneAnalyzer() {
        return luceneAnalyzer;
    }

    public void setLuceneAnalyzer(Analyzer luceneAnalyzer) {
        this.luceneAnalyzer = luceneAnalyzer;
    }

    public Configuration getConfig() {
        return this.config;
    }

    @Override
    public void setConfig(Configuration config) {
        this.config = config;
        this.configName = config.getName();
    }
}
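
Usage

The loader is configured entirely through its setters. Below is a minimal usage sketch, not part of the framework: it assumes Lucene 4.x (matching the IndexWriterConfig(Version, Analyzer) constructor used above) with a StandardAnalyzer, and a Configuration assembled elsewhere (in the framework this is typically done through Spring wiring). The index path and the buildConfiguration() helper are hypothetical.

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.kew.rmf.core.configuration.Configuration;
import org.kew.rmf.core.lucene.LuceneDataLoader;

public class LuceneDataLoaderExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical helper: assemble a Configuration (authority file, delimiter,
        // encoding, per-property transformers) as required by the matching task.
        Configuration config = buildConfiguration();

        LuceneDataLoader loader = new LuceneDataLoader();
        loader.setConfig(config); // also records the configuration name used in log messages
        loader.setLuceneVersion(Version.LUCENE_45); // assumed 4.x version constant
        loader.setLuceneAnalyzer(new StandardAnalyzer(Version.LUCENE_45));
        loader.setDirectory(FSDirectory.open(new File("target/index"))); // hypothetical index location

        // Reads the authority data (file or database) and writes the Lucene index,
        // closing the IndexWriter when done.
        loader.load();
    }

    private static Configuration buildConfiguration() {
        throw new UnsupportedOperationException("assemble a Configuration here");
    }
}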