gov.va.vinci.leo.cr.BaseFileCollectionReader.java Source code

Java tutorial

Introduction

Here is the source code for gov.va.vinci.leo.cr.BaseFileCollectionReader.java

Source

/**
 *
 */
package gov.va.vinci.leo.cr;

/*
 * #%L
 * Leo
 * %%
 * Copyright (C) 2010 - 2014 Department of Veterans Affairs
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.gson.Gson;
import gov.va.vinci.leo.tools.LeoUtils;
import gov.va.vinci.leo.tools.ConfigurationParameterImpl;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.log4j.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.ConfigurationParameter;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Get a list of the files in a directory and return them one at a time.  Optionally a
 * recurse flag can also be set which causes the reader to search recursively
 * through the entire folder structure of the parent directory.
 *
 * @author thomasginter
 */
public abstract class BaseFileCollectionReader extends BaseLeoCollectionReader {
    /**
     * Input Directory File object to be searched for available files.
     */
    protected File mInDir = null;

    /**
     * Recurse flag, sub-directories are searched as well if true.
     * Defaults to false.
     */
    protected boolean mRecurse = false;

    /**
     * Index of the next file to be processed.
     */
    protected int mFileIndex = 0;

    /**
     * Files found by {@link #findFiles(File)}, in the order they were discovered.
     */
    protected ArrayList<File> mFileCollection = new ArrayList<File>();

    /**
     * Encoding type for the files being read in.  Remains <code>null</code> until it is set through a
     * constructor or {@link #setEncoding(String)}; {@link #initialize()} falls back to the platform
     * default charset when no encoding parameter has been configured.
     */
    protected String mEncoding = null;

    /**
     * Filters the files found by file name.  When <code>null</code> every file is accepted.
     */
    protected FilenameFilter filenameFilter = null;

    /**
     * Name of the input Directory Parameter.
     */
    public final static String inputDirectoryParam = "inputDirectory";
    /**
     * Name of the recurse flag Parameter.
     */
    public final static String recurseParam = "find_recurse";
    /**
     * Name of the encoding parameter.
     */
    public final static String encodingParam = "encoding";
    /**
     * One or more file name extensions for filtering input files.
     */
    public final static String filterParam = "fileExtensionsFilter";

    /**
     * Logger for class.
     */
    protected Logger LOG = Logger.getLogger(this.getClass());

    /**
     * Default constructor used during UIMA initialization.
     */
    public BaseFileCollectionReader() {

    }

    /**
     * Constructor that sets the input directory to be searched and the recurse flag that
     * controls whether or not the reader will descend in to subdirectories.  No file name
     * filter and no encoding are set.
     *
     * @param inputDirectory Input directory to be searched
     * @param recurse        Recurse flag will descend into subdirectories if true
     * @throws IllegalArgumentException if inputDirectory is null or is not an existing directory
     */
    public BaseFileCollectionReader(File inputDirectory, boolean recurse) {
        this(inputDirectory, recurse, null);
    }

    /**
     * Initialize with the input directory to be searched, recurse flag for parsing into subdirectories,
     * and file name filter to be used.  No encoding is set.
     *
     * @param inputDirectory Input directory to be searched
     * @param recurse        Recurse flag will descend into subdirectories if true
     * @param filter         File name extension filter to use, if null this class accepts every file
     * @throws IllegalArgumentException if inputDirectory is null or is not an existing directory
     */
    public BaseFileCollectionReader(File inputDirectory, boolean recurse, SuffixFileFilter filter) {
        this(inputDirectory, null, recurse, filter);
    }

    /**
     * Initialize with the input directory to be searched, the encoding of the files, recurse flag for
     * parsing into subdirectories, and file name filter to be used.
     *
     * @param inputDirectory Input directory to be searched
     * @param encoding       Encoding format to use when reading in the data, may be null
     * @param recurse        Recurse flag will descend into subdirectories if true
     * @param filter         File name extension filter to use, if null this class accepts every file
     * @throws IllegalArgumentException if inputDirectory is null or is not an existing directory
     */
    public BaseFileCollectionReader(File inputDirectory, String encoding, boolean recurse,
            SuffixFileFilter filter) {
        if (inputDirectory == null || !inputDirectory.isDirectory()) {
            throw new IllegalArgumentException(
                    "Input Directory must not be null, and must point to an existing directory.");
        }

        // The fields are assigned directly rather than through the public setters: the setters can
        // be overridden, and an override would run before the subclass constructor has executed.
        this.mInDir = inputDirectory;
        this.mRecurse = recurse;
        this.filenameFilter = filter;
        this.mEncoding = encoding;
        this.mFileIndex = 0;
    }

    /**
     * Reads the input directory, recurse flag, encoding and file name filter from the configuration
     * parameters, then builds the collection of files to be processed and resets the file index.
     * <p>
     * A recurse parameter of <code>true</code> switches recursion on; any other value leaves the
     * current flag untouched.  When no encoding parameter is present the canonical name of the
     * platform default charset is used.
     *
     * @throws ResourceInitializationException if the parent initialization fails or the input
     *                                         directory parameter is not set.
     */
    @Override
    public void initialize() throws ResourceInitializationException {
        super.initialize();

        String inputDirectory = (String) getConfigParameterValue(inputDirectoryParam);
        if (inputDirectory == null) {
            // new File((String) null) would otherwise fail with an unexplained NullPointerException
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Required configuration parameter '" + inputDirectoryParam + "' is not set."));
        }
        mInDir = new File(inputDirectory);

        Boolean recurse = (Boolean) getConfigParameterValue(recurseParam);
        if (recurse != null && recurse) {
            mRecurse = true;
        }

        String encoding = (String) getConfigParameterValue(encodingParam);
        if (encoding != null) {
            mEncoding = encoding;
        } else {
            // name() is the canonical charset name; displayName() is allowed to be localized
            mEncoding = Charset.defaultCharset().name();
        }

        String filterJson = (String) getConfigParameterValue(filterParam);
        if (filterJson != null) {
            filenameFilter = new Gson().fromJson(filterJson, SuffixFileFilter.class);
        }

        // Start from an empty collection so that a repeated call does not list every file twice.
        mFileCollection.clear();
        findFiles(mInDir);

        mFileIndex = 0;
    }

    /**
     * Find the list of files that meet the requirements and append them to the file collection.
     * Regular files are added when no filter is set or when the filter accepts their name;
     * sub-directories are never added themselves and are only searched when the recurse flag is set.
     * Nothing happens when the argument is null, does not exist, or cannot be listed.
     *
     * @param f the file to search. This should be a directory.
     */
    protected void findFiles(File f) {
        if (f == null || !f.exists()) {
            return;
        }

        // listFiles() returns null when f is not a directory or an I/O error occurs
        File[] entries = f.listFiles();
        if (entries == null) {
            LOG.warn("Unable to list the contents of '" + f.getAbsolutePath()
                    + "': it is not a directory or an I/O error occurred.");
            return;
        }

        if (filenameFilter == null) {
            // No filter: take the entries in listing order, descending as directories are met.
            for (File entry : entries) {
                if (!entry.isDirectory()) {
                    mFileCollection.add(entry);
                } else if (mRecurse) {
                    findFiles(entry);
                }
            }
            return;
        }

        // With a filter: matching files first, then the sub-directories.  The filter is applied to
        // regular files only, so a directory whose name happens to match is searched exactly once.
        for (File entry : entries) {
            if (!entry.isDirectory() && filenameFilter.accept(f, entry.getName())) {
                mFileCollection.add(entry);
            }
        }
        if (mRecurse) {
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    findFiles(entry);
                }
            }
        }
    }

    /**
     * Set the inputDirectory for this reader.  The value is stored as given, without validation.
     *
     * @param inputDirectory the input directory to load files from.
     */
    public void setInputDirectory(File inputDirectory) {
        mInDir = inputDirectory;
    }

    /**
     * Set the recurse flag for this reader.  The recurse flag controls whether or not this
     * reader will recurse through subdirectories to find files or simply search the parent
     * directory alone.
     *
     * @param recurse  if true, sub directories are also searched, otherwise just the specified directory is
     *                 used for input.
     */
    public void setRecurseFlag(boolean recurse) {
        mRecurse = recurse;
    }

    /**
     * Return the number of documents in the set.
     *
     * @return the size of the collection.
     */
    public int getCollectionSize() {
        return mFileCollection.size();
    }

    /**
     * @return the index of the current document in the collection.
     */
    public int getCurrentIndex() {
        return mFileIndex;
    }

    /**
     * @return true if there is another document in the collection, false if not.
     * @throws java.io.IOException if there is an error reading the data.
     * @throws org.apache.uima.collection.CollectionException if retrieval of the next file fails
     */
    public boolean hasNext() throws IOException, CollectionException {
        return mFileIndex < mFileCollection.size();
    }

    /**
     * Get the next file to be processed in the pipeline.
     *
     * @param aCAS the cas to populate with the next document.
     * @throws java.io.IOException if there is an error reading the data.
     * @throws org.apache.uima.collection.CollectionException if retrieval of the next file fails
     */
    public abstract void getNext(CAS aCAS) throws IOException, CollectionException;

    /**
     * Report progress as the current file index out of the total number of files found.
     *
     * @return an array holding a single Progress object measured in entities.
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(getCurrentIndex(), getCollectionSize(), Progress.ENTITIES) };
    }

    /**
     * Return the encoding format that this CollectionReader will use for the source data.
     *
     * @return encoding format String, may be null if none has been set yet.
     */
    public String getEncoding() {
        return mEncoding;
    }

    /**
     * Set the file encoding from the encoding string provided.  Determines the kind of encoding to use when reading in
     * source data.
     *
     * @param encoding encoding format to use.
     */
    public void setEncoding(String encoding) {
        this.mEncoding = encoding;
    }

    /**
     * Create a map of parameter names and values from the parameters in the static inner Param class as well as class
     * field variables.  Then return a CollectionReader which has been initialized by the framework using the
     * parameter settings provided.  The file name filter, when set, is passed along as JSON.
     *
     * @return CollectionReader object.
     * @throws ResourceInitializationException if no input directory has been set or there is an error
     *                                         initializing the CollectionReader.
     */
    public CollectionReader produceCollectionReader() throws ResourceInitializationException {
        if (mInDir == null) {
            // Reachable through the default constructor; fail with a message instead of a bare NPE.
            throw new ResourceInitializationException(new IllegalStateException(
                    "The input directory must be set before a CollectionReader can be produced."));
        }

        Map<String, Object> parameterValues = new HashMap<String, Object>();
        parameterValues.put(Param.ENCODING.getName(), mEncoding);
        if (filenameFilter != null) {
            parameterValues.put(Param.FILE_EXTENSIONS_FILTER.getName(), new Gson().toJson(filenameFilter));
        }
        parameterValues.put(Param.FIND_RECURSE.getName(), mRecurse);
        parameterValues.put(Param.INPUT_DIRECTORY.getName(), mInDir.getAbsolutePath());
        return produceCollectionReader(LeoUtils.getStaticConfigurationParameters(Param.class), parameterValues);
    }

    /**
     * Static inner class for holding parameter information.  The parameter names are taken from
     * the name constants of the enclosing class so that the two cannot drift apart.
     */
    public static class Param extends BaseLeoCollectionReader.Param {
        /**
         * Input directory to read from.
         */
        public static ConfigurationParameter INPUT_DIRECTORY = new ConfigurationParameterImpl(inputDirectoryParam,
                "The directory to use for reading in files", ConfigurationParameter.TYPE_STRING, true, false,
                new String[] {});
        /**
         * Determine if inputDirectory is searched recursively or not.
         */
        public static ConfigurationParameter FIND_RECURSE = new ConfigurationParameterImpl(recurseParam,
                "If true, the directory is recursively searched.", ConfigurationParameter.TYPE_BOOLEAN, false,
                false, new String[] {});

        /**
         * The document encoding. (ie UTF-8)
         */
        public static ConfigurationParameter ENCODING = new ConfigurationParameterImpl(encodingParam,
                "The text encoding used in the file.", ConfigurationParameter.TYPE_STRING, false, false,
                new String[] {});

        /**
         * A file extension filter to only return documents matching a certain file extension.
         */
        public static ConfigurationParameter FILE_EXTENSIONS_FILTER = new ConfigurationParameterImpl(
                filterParam,
                "A file extension filter to only return documents matching a certain file extension.",
                ConfigurationParameter.TYPE_STRING, false, false, new String[] {});
    }
}//BaseFileCollectionReader class