io.github.infolis.algorithm.Indexer.java Source code

Introduction

Here is the source code for io.github.infolis.algorithm.Indexer.java, a class that adds text files to a Lucene (version 3.5) index.

Source

package io.github.infolis.algorithm;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
import io.github.infolis.InfolisConfig;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.infolink.luceneIndexing.CaseSensitiveStandardAnalyzer;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.entity.InfolisFile;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class for adding text files to a Lucene index.
 *
 * @author kata
 * @author kba
 */
public class Indexer extends BaseAlgorithm {

    private final static String INDEX_DIR_PREFIX = "infolis-index-";

    public Indexer(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    private Logger log = LoggerFactory.getLogger(Indexer.class);

    /*
     * kba: That was the old value of {@link IndexWriter.MaxFieldLength.LIMITED}
     */
    private static final int MAX_TOKEN_COUNT = 10000;

    /**
     * Wraps the project's case-sensitive analyzer so that each field is
     * truncated after {@value #MAX_TOKEN_COUNT} tokens, matching the behaviour
     * of the old IndexWriter.MaxFieldLength.LIMITED.
     */
    public static Analyzer createAnalyzer() {
        return new LimitTokenCountAnalyzer(new CaseSensitiveStandardAnalyzer(), MAX_TOKEN_COUNT);
    }

    /**
     * Creates (or reuses) the configured index directory, fetches all input
     * files from the datastore and writes one Lucene document per file. The
     * directory that was written is reported via setOutputDirectory().
     */
    @Override
    public void execute() throws IOException {
        File indexDir;
        if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) {
            indexDir = new File(getExecution().getIndexDirectory());
        } else {
            indexDir = Files.createTempDirectory(
                    InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX).toFile();
            FileUtils.forceDeleteOnExit(indexDir);
        }
        log.debug("Indexing to: " + indexDir.getAbsolutePath());
        getExecution().setOutputDirectory(indexDir.getAbsolutePath());

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, createAnalyzer());
        indexWriterConfig.setOpenMode(OpenMode.CREATE);
        FSDirectory fsIndexDir = FSDirectory.open(indexDir);

        List<InfolisFile> files = new ArrayList<>();
        for (String fileUri : getExecution().getInputFiles()) {
            try {
                files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri));
            } catch (Exception e) {
                error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage());
                getExecution().setStatus(ExecutionStatus.FAILED);
                persistExecution();
                return;
            }
        }

        Date start = new Date();
        log.debug("Starting to index");
        IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig);
        try {
            int counter = 0;
            for (InfolisFile file : files) {
                counter++;
                log.trace("Indexing file " + file);
                writer.addDocument(toLuceneDocument(getInputFileResolver(), file));
                updateProgress(counter, files.size());
            }
        } catch (FileNotFoundException fnfe) {
            // NOTE: at least on Windows, some temporary files raise this
            // exception with an "access denied" message; checking beforehand
            // whether the file can be read doesn't help
            throw new RuntimeException("Could not write index entry: " + fnfe.getMessage(), fnfe);
        } finally {
            log.debug("Merging all Lucene segments ...");
            writer.forceMerge(1);
            writer.close();
        }
        getExecution().setStatus(ExecutionStatus.FINISHED);
        fsIndexDir.close();
        log.debug(String.format("Indexing %s documents took %s ms", files.size(),
                new Date().getTime() - start.getTime()));
    }

    @Override
    public void validate() throws IllegalAlgorithmArgumentException {
        Execution exec = this.getExecution();
        if (null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) {
            throw new IllegalAlgorithmArgumentException(getClass(), "inputFiles", "missing or empty");
        }
    }

    /**
     * Builds a Lucene document from an {@link InfolisFile}. Documents are
     * created as follows:
     * <ol>
     * <li>The URI of the file is added as a field named "path". The field is
     * indexed (i.e. searchable), but not tokenized into words.</li>
     * <li>The file name is added as a field named "fileName". The field is
     * indexed and tokenized.</li>
     * <li>The contents of the file are read as UTF-8 and added to a field
     * named "contents". The text is tokenized, indexed and stored along with
     * position and offset information.</li>
     * </ol>
     *
     * @param fileResolver the resolver used to open an input stream for the file
     * @param f a text file to be included in the Lucene index
     * @return a Lucene document
     * @throws IOException if the file cannot be read
     */
    public static Document toLuceneDocument(FileResolver fileResolver, InfolisFile f) throws IOException {
        // Use the code below to process PDFs instead of plain text (requires PDFBox):
        /*
        FileInputStream fi = new FileInputStream(new File(f.getPath()));
        PDFParser parser = new PDFParser(fi);
        parser.parse();
        COSDocument cd = parser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(new PDDocument(cd));
        */
        StringBuilder contents = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fileResolver.openInputStream(f), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                contents.append(line).append(System.lineSeparator());
            }
        }
        String text = contents.toString();

        // make a new, empty document
        Document doc = new Document();

        // Add the URI of the file as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize it into words.
        doc.add(new Field("path", f.getUri(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("fileName", f.getFileName(), Field.Store.YES, Field.Index.ANALYZED));

        // TODO kba: add a "modified" field once InfolisFile carries a
        // last-modified date, e.g.:
        // doc.add(new Field("modified",
        //         DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
        //         Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Add the contents of the file to a field named "contents". The text is
        // tokenized, indexed and stored, with both position and offset
        // information, so that contexts can later be reconstructed from the index.
        doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));

        // cd.close(); // only needed for the commented-out PDFBox variant above
        return doc;
    }
}
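
For orientation, here is a minimal sketch of how the class above might be driven. The constructor, validate() and execute() appear in the listing; setInputFiles(), setIndexDirectory(), setExecution() and getOutputDirectory() are assumed counterparts of the accessors the listing uses, so treat the wiring as illustrative rather than the framework's documented API.

import java.util.List;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;

public class IndexerUsageSketch {

    // Sketch only: the setters and getOutputDirectory() below are assumed
    // counterparts of the accessors used in Indexer, not confirmed API.
    public static String buildIndex(DataStoreClient client, FileResolver resolver,
            List<String> fileUris) throws Exception {
        Execution execution = new Execution();
        execution.setInputFiles(fileUris);         // URIs of InfolisFile records (assumed setter)
        // execution.setIndexDirectory("/some/dir"); // optional; a temp directory is created otherwise

        Indexer indexer = new Indexer(client, client, resolver, resolver);
        indexer.setExecution(execution);           // assumed setter matching getExecution()
        indexer.validate();                        // rejects a missing or empty inputFiles list
        indexer.execute();                         // writes the index and sets the execution status
        return execution.getOutputDirectory();     // directory reported via setOutputDirectory()
    }
}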
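
Once execute() has returned, the index can be read back with the stock Lucene 3.5 API. The field names ("path", "fileName", "contents") are the ones written by toLuceneDocument() above; because the analyzer is case-sensitive, the term must match the case of the indexed text. A minimal sketch, assuming the index directory path and a search term are passed on the command line:

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class IndexSearchSketch {

    public static void main(String[] args) throws Exception {
        // args[0]: the directory Indexer reported via setOutputDirectory()
        // args[1]: a single term to look up in the "contents" field
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(args[0])));
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            TopDocs hits = searcher.search(new TermQuery(new Term("contents", args[1])), 10);
            for (ScoreDoc sd : hits.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                // "path" stores the InfolisFile URI, per toLuceneDocument()
                System.out.println(doc.get("path") + " (score " + sd.score + ")");
            }
        } finally {
            searcher.close();
            reader.close();
        }
    }
}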