Java tutorial
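
The listing below is Neji's FileBatchExecutor (package pt.ua.tm.neji.batch), a BatchExecutor implementation that processes a folder of documents concurrently: it builds a file filter from an optional wildcard, creates one Corpus per matching input file, wraps each file in input and output wrappers for every configured output format, and submits one pipeline Processor per file to a fixed-size thread pool.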
/*
 * Copyright (c) 2016 BMD Software and University of Aveiro.
 *
 * Neji is a flexible and powerful platform for biomedical information extraction from text.
 *
 * This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
 * To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/.
 *
 * This project is a free software, you are free to copy, distribute, change and transmit it.
 * However, you may not use it for commercial purposes.
 *
 * It is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */
package pt.ua.tm.neji.batch;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.AndFileFilter;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pt.ua.tm.neji.context.Context;
import pt.ua.tm.neji.context.OutputFormat;
import pt.ua.tm.neji.core.batch.BatchExecutor;
import pt.ua.tm.neji.core.corpus.Corpus;
import pt.ua.tm.neji.core.processor.Processor;
import pt.ua.tm.neji.exception.NejiException;
import pt.ua.tm.neji.processor.filewrappers.InputFile;
import pt.ua.tm.neji.processor.filewrappers.OutputFile;

import java.io.File;
import java.io.FileFilter;
import java.util.*;
import java.util.concurrent.*;

/**
 * Batch pipeline processors executor, with support for concurrent execution of multiple pipeline processors.
 *
 * @author David Campos
 * @author Tiago Nunes
 * @version 1.0
 * @since 1.0
 */
public class FileBatchExecutor extends BatchExecutor {

    private static Logger logger = LoggerFactory.getLogger(FileBatchExecutor.class);

    private String inputFolderPath, outputFolderPath, inputWildcardFilter;
    private int numThreads;
    private boolean compressed;
    private int filesProcessed;
    private Collection<Corpus> processedCorpora;
    private boolean storeDocuments;
    private final boolean addAnnotationsWithoutIDs;

    public FileBatchExecutor(final String inputFolderPath, final String outputFolderPath,
                             final boolean compressed, final int numThreads,
                             final String inputWildcardFilter, final boolean storeDocuments,
                             final boolean addAnnotationsWithoutIDs) {
        this.inputFolderPath = inputFolderPath;
        this.outputFolderPath = outputFolderPath;
        this.inputWildcardFilter = inputWildcardFilter;
        this.compressed = compressed;
        this.numThreads = numThreads;
        this.filesProcessed = 0;
        this.processedCorpora = new ArrayList<>();
        this.storeDocuments = storeDocuments;
        this.addAnnotationsWithoutIDs = addAnnotationsWithoutIDs;
    }

    public FileBatchExecutor(final String inputFolderPath, final String outputFolderPath,
                             final boolean compressed, final int numThreads,
                             final boolean storeDocuments, final boolean addAnnotationsWithoutIDs) {
        this(inputFolderPath, outputFolderPath, compressed, numThreads, null,
                storeDocuments, addAnnotationsWithoutIDs);
    }

    private static FileFilter newFileFilter(String wildcardFilter, boolean compressed) {
        List<String> wildcards = new ArrayList<>();
        if (StringUtils.isNotBlank(wildcardFilter)) {
            wildcards.add(wildcardFilter);
        }
        if (compressed) {
            wildcards.add("*.gz");
        }
        if (wildcards.isEmpty()) {
            wildcards.add("*");
        }
        return new AndFileFilter(new WildcardFileFilter(wildcards), HiddenFileFilter.VISIBLE);
    }

    @Override
    public void run(Class<? extends Processor> processorCls, Context context, Object... args)
            throws NejiException {
        logger.info("Initializing context...");
        context.initialize();

        logger.info("Installing multi-threading support...");
        context.addMultiThreadingSupport(numThreads);

        StopWatch timer = new StopWatch();
        timer.start();

        filesProcessed = processFiles(inputFolderPath, inputWildcardFilter, outputFolderPath,
                numThreads, context, processorCls, args);

        logger.info("Stopped thread pool.");

        logger.info("Terminating context...");
        context.terminate();

        timer.stop();
        logger.info("Processed {} files in {}", filesProcessed, timer.toString());
    }

    @Override
    public Collection<Corpus> getProcessedCorpora() {
        return processedCorpora;
    }

    private int processFiles(final String inputFolderPath, final String inputWildcardFilter,
                             final String outputFolderPath, final int numThreads, Context context,
                             final Class<? extends Processor> processorCls, Object... args) {
        int filesProcessed = 0;

        File inputFolder = new File(inputFolderPath);
        FileFilter fileFilter = newFileFilter(inputWildcardFilter, compressed);
        File[] files = inputFolder.listFiles(fileFilter);

        logger.info("Starting thread pool with support for {} threads...", numThreads);
        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
        LinkedList<Future> futures = new LinkedList<>();

        for (File file : files) {
            // Make corpus, output file
            Corpus corpus = new Corpus();
            // By default, the corpus identifier is the file name
            corpus.setIdentifier(FilenameUtils.getBaseName(file.getName()));

            // Make in/out corpus wrappers
            InputFile inputFile = new InputFile(corpus, file, compressed);

            List<OutputFile> outputFiles = new ArrayList<>();
            for (OutputFormat outputFormat : context.getConfiguration().getOutputFormats()) {
                File outFile = OutputFile.newOutputFile(outputFolderPath,
                        FilenameUtils.getBaseName(FilenameUtils.getBaseName(file.getName())),
                        outputFormat, compressed);
                outputFiles.add(new OutputFile(corpus, outFile, compressed));
            }

            if (storeDocuments) {
                processedCorpora.add(corpus);
            }

            Processor processor;
            try {
                processor = newProcessor(processorCls, context, inputFile, outputFiles,
                        addAnnotationsWithoutIDs, args);
            } catch (NejiException ex) {
                String m = "There was a problem creating the processor of the file: " + file.getAbsolutePath();
                logger.error(m, ex);
                throw new RuntimeException(m, ex);
            }

            Future submit = executor.submit(processor);
            futures.add(submit);
        }

        logger.info("");
        logger.info("{} file(s) to process.", futures.size());
        logger.info("Started processing...");

        Iterator<Future> it = futures.iterator();
        while (it.hasNext()) {
            Future future = it.next();
            try {
                Object o = future.get();
                future = null;
                it.remove();
                filesProcessed++;
            } catch (ExecutionException | InterruptedException ex) {
                String m = "There was a problem running the processor.";
                logger.error(m, ex);
            }
        }

        executor.shutdown();
        try {
            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
        } catch (InterruptedException ex) {
            String m = "There was a problem executing the processing tasks.";
            logger.error(m, ex);
            throw new RuntimeException(m, ex);
        }

        return filesProcessed;
    }
}
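
As a usage reference, here is a minimal sketch of how this executor might be driven from application code. It assumes a fully configured Context (dictionaries, models and output formats set up elsewhere) and a concrete Processor class passed in by the caller; the wrapper class name, folder paths and wildcard below are illustrative assumptions, not part of Neji.

package example; // hypothetical wrapper class, not part of Neji

import pt.ua.tm.neji.batch.FileBatchExecutor;
import pt.ua.tm.neji.context.Context;
import pt.ua.tm.neji.core.processor.Processor;
import pt.ua.tm.neji.exception.NejiException;

public class BatchExample {

    // Processes every *.txt file under "input/" with 4 worker threads and
    // writes one output file per configured OutputFormat into "output/".
    public static void runBatch(Context context,
                                Class<? extends Processor> processorCls) throws NejiException {
        FileBatchExecutor batch = new FileBatchExecutor(
                "input/",   // inputFolderPath: folder scanned for documents
                "output/",  // outputFolderPath: results are written here
                false,      // compressed: true to read/write gzip (*.gz) files
                4,          // numThreads: size of the worker thread pool
                "*.txt",    // inputWildcardFilter: restrict which files are picked up
                true,       // storeDocuments: keep processed Corpus objects in memory
                false);     // addAnnotationsWithoutIDs

        batch.run(processorCls, context);

        // Because storeDocuments is true, the processed corpora stay available in memory.
        System.out.println("Corpora kept in memory: " + batch.getProcessedCorpora().size());
    }
}

Note that run() blocks until all submitted tasks have finished: it waits on each Future and then on the executor's awaitTermination, so the processed corpora are complete by the time run() returns.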