io.github.infolis.algorithm.RegexSearcher.java Source code

Introduction

Here is the source code for io.github.infolis.algorithm.RegexSearcher.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package io.github.infolis.algorithm;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.util.LimitedTimeMatcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 * @author domi
 * @author kata
 * @author kba
 */
/**
 * Algorithm that applies the regular expressions of the execution's
 * {@link InfolisPattern}s to the execution's input files and stores every
 * accepted match as a {@link TextualReference}.
 *
 * PDF input files are converted with {@link TextExtractor} before searching;
 * any other non-plain-text media type is rejected.
 */
public class RegexSearcher extends BaseAlgorithm {

    private static final Logger log = LoggerFactory.getLogger(RegexSearcher.class);

    /** Matches strings that consist solely of non-letter characters. */
    private static final Pattern NO_LETTERS = Pattern.compile("\\P{L}+");

    /**
     * Upper bound for the per-search time limit handed to
     * {@link LimitedTimeMatcher}.
     * NOTE(review): an older comment mentioned "750000 suitable for -Xmx2g
     * -Xms2g" while the code has always used 75_000 here - confirm which
     * value is intended.
     */
    private static final int MAX_SEARCH_TIME_LIMIT = 75_000;

    public RegexSearcher(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Reads the whole file through the input file resolver and collapses every
     * run of whitespace into a single blank.
     *
     * @param file the file to read
     * @return the file content with normalized whitespace
     * @throws IOException if the file cannot be opened or read
     */
    private String getFileAsString(InfolisFile file) throws IOException {
        String input;
        // try-with-resources: the stream is closed even if reading fails
        try (InputStream in = getInputFileResolver().openInputStream(file)) {
            input = IOUtils.toString(in);
        }
        log.trace("Input: {}", input);
        // makes regex matching a bit easier
        return input.replaceAll("\\s+", " ");
    }

    /**
     * Computes the time limit for one pattern search in the given file: the
     * number of bytes reported as available by a freshly opened stream, capped
     * at {@link #MAX_SEARCH_TIME_LIMIT}. Processing time depends on the size
     * of the document; this allows one time unit per byte, i.e. 1024 per KB.
     *
     * @param file the file that is going to be searched
     * @return the time limit passed to {@link LimitedTimeMatcher}
     * @throws IOException if the file cannot be opened
     */
    private long computeTimeLimit(InfolisFile file) throws IOException {
        try (InputStream in = getInputFileResolver().openInputStream(file)) {
            return Math.min(MAX_SEARCH_TIME_LIMIT, in.available());
        }
    }

    /**
     * Applies every pattern of the execution to the content of the given file
     * and collects the textual references created for the accepted matches.
     *
     * A match is skipped when its first capturing group (the study name)
     * consists only of non-letter characters, or when the execution's
     * upper-case constraint is set and the study name contains no upper-case
     * character.
     *
     * @param file the plain-text file to search
     * @return the references found in the file, possibly empty
     * @throws IOException if the file cannot be read
     */
    // TODO: use second lucene index here... (case-insensitive whitespaceAnalyzer)
    private List<TextualReference> searchForPatterns(InfolisFile file) throws IOException {
        String inputClean = getFileAsString(file);
        // The limit depends on the file only, so compute it once instead of
        // re-opening the file for every pattern.
        long maxTimeMillis = computeTimeLimit(file);

        List<TextualReference> res = new ArrayList<>();
        for (String patternURI : this.getExecution().getPatterns()) {
            log.trace(patternURI);
            InfolisPattern pattern = getInputDataStoreClient().get(InfolisPattern.class, patternURI);
            String regex = pattern.getPatternRegex();
            // SLF4J substitutes "{}" placeholders, not printf-style "%s"
            log.trace("Searching for pattern '{}'", regex);
            Pattern p = Pattern.compile(regex);

            // Catastrophic backtracking may occur and cause the application to
            // hang, so the matching is delegated to LimitedTimeMatcher, which
            // monitors the runtime of the matching thread and terminates it if
            // processing takes too long.
            LimitedTimeMatcher ltm = new LimitedTimeMatcher(p, inputClean, maxTimeMillis,
                    file.getFileName() + "\n" + regex);
            ltm.run();
            // thread was aborted due to long processing time
            if (!ltm.finished()) {
                // TODO: what to do if search was aborted?
                log.error("Search was aborted. TODO");
            }
            while (ltm.matched()) {
                String context = ltm.group();
                String studyName = ltm.group(1).trim();
                log.debug("found pattern {} in {}", regex, file);
                log.debug("referenced study name: {}", studyName);
                // if the study name contains no letters: ignore
                // TODO: not accurate - include accents etc in match... \p{M}?
                if (NO_LETTERS.matcher(studyName).matches()) {
                    log.debug("Invalid study name \"{}\". Searching for next match of pattern {}",
                            studyName, regex);
                    ltm.run();
                    continue;
                }
                // a study name is supposed to be a named entity and thus should
                // contain at least one upper-case character
                if (this.getExecution().isUpperCaseConstraint()
                        && studyName.toLowerCase().equals(studyName)) {
                    log.debug("Match does not satisfy uppercase-constraint \"{}\". Processing new match...",
                            studyName);
                    ltm.run();
                    continue;
                }

                List<TextualReference> references = SearchTermPosition.getContexts(getOutputDataStoreClient(),
                        file.getUri(), studyName, context);
                for (TextualReference ref : references) {
                    ref.setPattern(pattern.getUri());
                    log.debug("added reference: {}", ref);
                }
                res.addAll(references);
                log.trace("Added references.");

                log.trace("Searching for next match of pattern {}", regex);
                ltm.run();
            }
        }
        log.trace("Done searching for patterns in {}", file);
        return res;
    }

    /**
     * Runs a {@link TextExtractor} sub-execution on the given PDF file and
     * returns the first output file of that execution.
     *
     * @param pdfFile the PDF file to convert
     * @return the converted file as fetched from the extractor's output data store
     * @throws IOException if the converted content cannot be read for trace logging
     * @throws RuntimeException if the conversion reported no output file
     */
    private InfolisFile convertPdfToText(InfolisFile pdfFile) throws IOException {
        Execution convertExec = new Execution();
        convertExec.setAlgorithm(TextExtractor.class);
        convertExec.setInputFiles(Arrays.asList(pdfFile.getUri()));
        // TODO wire this more efficiently so files are stored temporarily
        Algorithm algo = convertExec.instantiateAlgorithm(this);
        // do the actual conversion
        algo.run();
        // fail with a descriptive message instead of an index error on get(0)
        if (null == convertExec.getOutputFiles() || convertExec.getOutputFiles().isEmpty()) {
            throw new RuntimeException("Text extraction produced no output file for " + pdfFile.getUri());
        }
        InfolisFile convertedInputFile = algo.getOutputDataStoreClient().get(InfolisFile.class,
                convertExec.getOutputFiles().get(0));
        log.debug("Converted {} -> {}", pdfFile.getUri(), convertedInputFile.getUri());
        // Only read the (potentially large) content when it is actually going
        // to be logged, and make sure the stream is closed afterwards.
        if (log.isTraceEnabled()) {
            try (InputStream in = algo.getInputFileResolver().openInputStream(convertedInputFile)) {
                log.trace("Content: " + IOUtils.toString(in));
            }
        }
        return convertedInputFile;
    }

    /**
     * Resolves file and pattern tags through a {@link TagResolver}
     * sub-execution, searches every input file for every pattern, posts the
     * resulting textual references to the output data store and registers
     * their URIs with the execution.
     *
     * If an input file cannot be retrieved from the data store, the execution
     * is marked {@link ExecutionStatus#FAILED}, persisted, and the method
     * returns without posting anything.
     *
     * @throws IOException if an input file cannot be read
     * @throws RuntimeException if an input file is not registered, has no
     *         media type, or is neither plain text nor PDF
     */
    @Override
    public void execute() throws IOException {
        Execution tagExec = getExecution().createSubExecution(TagResolver.class);
        tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
        tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags());
        tagExec.instantiateAlgorithm(this).run();

        getExecution().getPatterns().addAll(tagExec.getPatterns());
        getExecution().getInputFiles().addAll(tagExec.getInputFiles());

        List<TextualReference> detectedContexts = new ArrayList<>();
        int counter = 0;
        int size = getExecution().getInputFiles().size();
        log.debug("size: {}", size);
        for (String inputFileURI : getExecution().getInputFiles()) {
            counter++;
            log.trace("Input file URI: '{}'", inputFileURI);
            InfolisFile inputFile;
            try {
                inputFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI);
            } catch (Exception e) {
                error(log, "Could not retrieve file " + inputFileURI + ": " + e.getMessage());
                getExecution().setStatus(ExecutionStatus.FAILED);
                persistExecution();
                return;
            }
            if (null == inputFile) {
                throw new RuntimeException("File was not registered with the data store: " + inputFileURI);
            }
            if (null == inputFile.getMediaType()) {
                throw new RuntimeException("File has no mediaType: " + inputFileURI);
            }
            // if the input file is not a text file
            if (!inputFile.getMediaType().startsWith("text/plain")) {
                // only PDF files can be converted; everything else is rejected
                if (!inputFile.getMediaType().startsWith("application/pdf")) {
                    throw new RuntimeException(getClass() + " execution / inputFiles "
                            + "Can only search through text files or PDF files");
                }
                // continue with the text file created by the conversion
                inputFile = convertPdfToText(inputFile);
            }
            log.trace("Start extracting from '{}'.", inputFile);
            updateProgress(counter, size);

            detectedContexts.addAll(searchForPatterns(inputFile));
        }

        for (TextualReference sC : detectedContexts) {
            getOutputDataStoreClient().post(TextualReference.class, sC);
            this.getExecution().getTextualReferences().add(sC.getUri());
        }

        getExecution().setStatus(ExecutionStatus.FINISHED);
        log.debug("No. contexts found: {}", getExecution().getTextualReferences().size());
    }

    /**
     * Checks that the execution names at least one input file (directly or
     * through file tags) and at least one pattern (directly or through
     * pattern tags).
     *
     * @throws IllegalArgumentException if either input files or patterns are missing
     */
    @Override
    public void validate() {
        Execution exec = this.getExecution();
        if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty())
                && (null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) {
            throw new IllegalArgumentException("Must set at least one inputFile!");
        }
        if ((null == exec.getPatterns() || exec.getPatterns().isEmpty())
                && (null == exec.getInfolisPatternTags() || exec.getInfolisPatternTags().isEmpty())) {
            throw new IllegalArgumentException("No patterns given.");
        }
    }
}