com.nridge.ds.content.ds_content.ContentExtractor.java Source code

Introduction

Here is the source code for com.nridge.ds.content.ds_content.ContentExtractor.java
Source

/*
 * NorthRidge Software, LLC - Copyright (c) 2015.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.nridge.ds.content.ds_content;

import com.nridge.core.app.mgr.AppMgr;
import com.nridge.core.base.field.Field;
import com.nridge.core.base.field.data.DataBag;
import com.nridge.core.base.field.data.DataField;
import com.nridge.core.base.field.data.DataTextField;
import com.nridge.core.base.std.NSException;
import com.nridge.core.base.std.StrUtl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;

import java.io.*;
import java.net.URL;
import java.nio.file.Paths;

/**
 * The ContentExtractor class is responsible for extracting textual content
 * from a file.  The Apache Tika toolkit detects and extracts metadata and
 * text content from various documents - from PPT to CSV to PDF - using
 * existing parser libraries. Tika unifies these parsers under a single
 * interface to allow you to easily parse over a thousand different file
 * types. Tika is useful for search engine indexing, content analysis,
 * translation, and much more.
 *
 * @see <a href="http://tika.apache.org/">Apache Tika</a>
 * @see <a href="http://www.massapi.com/class/org/apache/tika/fork/ForkParser.html">Apache Tika ForkParser</a>
 * @see <a href="http://www.tutorialspoint.com/tika/tika_quick_guide.htm">Apache Tika Tutorial</a>
 *
 * @since 1.0
 * @author Al Cole
 */
public class ContentExtractor {
    private DataBag mBag;
    private final AppMgr mAppMgr;
    private String mCfgPropertyPrefix = Content.CFG_PROPERTY_PREFIX;

    /**
     * Creates a content extractor bound to the given application manager.
     * No data bag is registered by this constructor, so meta data discovered
     * while processing content is not captured.
     *
     * @param anAppMgr Application manager instance.
     */
    public ContentExtractor(AppMgr anAppMgr) {
        this.mAppMgr = anAppMgr;
    }

    /**
     * Creates a content extractor bound to the given application manager
     * and registers the data bag that receives meta data fields.
     *
     * @param anAppMgr Application manager instance.
     * @param aBag Data bag instance to populate with meta data.
     */
    public ContentExtractor(AppMgr anAppMgr, DataBag aBag) {
        this.mBag = aBag;
        this.mAppMgr = anAppMgr;
    }

    /**
     * Returns the prefix that is prepended to configuration property
     * suffixes when property names are resolved.
     *
     * @return Property prefix string.
     */
    public String getCfgPropertyPrefix() {
        return this.mCfgPropertyPrefix;
    }

    /**
     * Assigns the prefix that is prepended to configuration property
     * suffixes when property names are resolved by this extractor.
     *
     * @param aPropertyPrefix Property prefix.
     */
    public void setCfgPropertyPrefix(String aPropertyPrefix) {
        this.mCfgPropertyPrefix = aPropertyPrefix;
    }

    /**
     * Looks up an application manager configuration property whose
     * name is the property prefix joined with the suffix.  A period
     * separator is inserted unless the suffix already starts with one.
     *
     * @param aSuffix Property name suffix.
     * @return Matching property value.
     */
    public String getCfgString(String aSuffix) {
        // Only insert the separator when the suffix does not already carry one.
        String separator = StringUtils.startsWith(aSuffix, ".") ? "" : ".";

        return mAppMgr.getString(mCfgPropertyPrefix + separator + aSuffix);
    }

    /**
     * Looks up an application manager configuration property whose
     * name is the property prefix joined with the suffix (a period
     * separator is inserted unless the suffix already starts with
     * one).  The default value is handed to the application manager
     * for use when the property is not found.
     *
     * @param aSuffix Property name suffix.
     * @param aDefaultValue Default value.
     *
     * @return Matching property value or the default value.
     */
    public String getCfgString(String aSuffix, String aDefaultValue) {
        final String propertyName = StringUtils.startsWith(aSuffix, ".")
            ? mCfgPropertyPrefix + aSuffix
            : mCfgPropertyPrefix + "." + aSuffix;

        return mAppMgr.getString(propertyName, aDefaultValue);
    }

    /**
     * Looks up an integer configuration property whose name is the
     * property prefix joined with the suffix (a period separator is
     * inserted unless the suffix already starts with one).
     *
     * @param aSuffix Property name suffix.
     * @param aDefaultValue Default value to return if property
     *                      name is not matched.
     *
     * @return Value of the property.
     */
    public int getCfgInteger(String aSuffix, int aDefaultValue) {
        String propertyName = mCfgPropertyPrefix;

        // Append the separator first when the suffix does not supply it.
        if (!StringUtils.startsWith(aSuffix, ".")) {
            propertyName += ".";
        }
        propertyName += aSuffix;

        return mAppMgr.getInt(propertyName, aDefaultValue);
    }

    /**
     * Resolves the configuration property identified by the suffix
     * and converts its string value to a boolean.
     *
     * @param aSuffix Property name suffix.
     *
     * @return <i>true</i> or <i>false</i>
     */
    public boolean isCfgStringTrue(String aSuffix) {
        return StrUtl.stringToBoolean(getCfgString(aSuffix));
    }

    /**
     * Quick test to determine if the file is valid for content
     * extraction.  A file is considered valid when the reference is
     * not null, it denotes an existing regular file (not a directory)
     * and its length is greater than zero.
     *
     * @param aFile File instance.
     *
     * @return <i>true</i> or <i>false</i>
     */
    public boolean isFileValid(File aFile) {
        // A directory also "exists" and File.length() is unspecified for it
        // (commonly non-zero), so require a regular file before trusting the size.
        return (aFile != null) && aFile.isFile() && (aFile.length() > 0L);
    }

    /**
     * Quick test to determine if the file is valid for content
     * extraction.  A null path/file name is reported as not valid.
     *
     * @param aPathFileName Path/File name.
     *
     * @return <i>true</i> or <i>false</i>
     */
    public boolean isFileValid(String aPathFileName) {
        // new File(null) throws NullPointerException; answer "not valid"
        // instead, matching how the File overload treats a null reference.
        if (aPathFileName == null)
            return false;

        return isFileValid(new File(aPathFileName));
    }

    /**
     * Asks the Tika facade to detect the content type of the file.
     * The default content type is returned when the file fails the
     * validity test or when detection raises an I/O error (which is
     * logged).
     *
     * @param aFile File instance.
     *
     * @return String representation of the file type.
     *
     * @see <a href="http://tika.apache.org/1.6/detection.html">Content Detection</a>
     *
     */
    public String detectType(File aFile) {
        Logger appLogger = mAppMgr.getLogger(this, "detectType");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        String detectedType = Content.CONTENT_TYPE_DEFAULT;

        if (isFileValid(aFile)) {
            try {
                detectedType = new Tika().detect(aFile);
            } catch (IOException e) {
                // Detection is best effort - log the failure and keep the default type.
                appLogger.error(String.format("%s: %s", aFile.getAbsolutePath(), e.getMessage()), e);
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return detectedType;
    }

    /**
     * Asks the Tika facade to detect the content type of the resource
     * identified by the URL.  The default content type is returned
     * when the URL is null or when detection raises an I/O error
     * (which is logged).
     *
     * @param aURL URL of the resource.
     *
     * @return String representation of the file type.
     *
     * @see <a href="http://tika.apache.org/1.6/detection.html">Content Detection</a>
     */
    public String detectType(URL aURL) {
        Logger appLogger = mAppMgr.getLogger(this, "detectType");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        String detectedType = Content.CONTENT_TYPE_DEFAULT;

        if (aURL != null) {
            try {
                Tika typeDetector = new Tika();
                detectedType = typeDetector.detect(aURL);
            } catch (IOException e) {
                // Detection is best effort - log the failure and keep the default type.
                String errorText = String.format("%s: %s", aURL.toString(), e.getMessage());
                appLogger.error(errorText, e);
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return detectedType;
    }

    /**
     * Asks the Tika facade to detect the content type from a document
     * name.  The type detection is based on known file name extensions.
     * <p>
     * The given name can also be a URL or a full file path. In such cases
     * only the file name part of the string is used for type detection.
     * </p>
     * The default content type is returned when the name is null or empty.
     *
     * @param aName Name of the document.
     *
     * @return String representation of the file type.
     *
     * @see <a href="http://tika.apache.org/1.6/detection.html">Content Detection</a>
     */
    public String detectType(String aName) {
        Logger appLogger = mAppMgr.getLogger(this, "detectType");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        String detectedType = StringUtils.isEmpty(aName)
            ? Content.CONTENT_TYPE_DEFAULT
            : new Tika().detect(aName);

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return detectedType;
    }

    /**
     * Stores a name/value pair in the registered data bag: an existing
     * field with that name has its value replaced, otherwise a new text
     * field is added.  Nothing happens when no bag has been registered.
     *
     * @param aFieldName Name of the field.
     * @param aFieldValue Value to assign to the field.
     */
    private void addAssignField(String aFieldName, String aFieldValue) {
        if (mBag == null) {
            return;
        }

        DataField existingField = mBag.getFieldByName(aFieldName);
        if (existingField != null) {
            existingField.setValue(aFieldValue);
            return;
        }

        DataField createdField = new DataTextField(aFieldName, Field.nameToTitle(aFieldName), aFieldValue);
        mBag.add(createdField);
    }

    /**
     * This method will extract the textual content from the input file
     * and write it to the writer stream.  If a bag instance has been
     * registered with the class and the <i>content_metadata</i> property
     * is enabled, then meta data fields will dynamically be assigned as
     * they are discovered.
     * <p>
     * Reaching the configured <i>content_limit</i> is not treated as a
     * failure: a warning is logged and the content written up to that
     * point is kept.  The writer is owned by the caller and is neither
     * flushed nor closed here.
     * </p>
     *
     * @param anInFile Input file instance.
     * @param aWriter Output writer stream.
     *
     * @throws NSException Thrown when a parameter is null, the file does
     *                     not exist or is empty, or the content could not
     *                     be parsed.
     */
    @SuppressWarnings("deprecation")
    public void process(File anInFile, Writer aWriter) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        // Reject null parameters up front (same contract as the URL overload)
        // rather than failing later with a NullPointerException.
        if ((anInFile == null) || (aWriter == null))
            throw new NSException("One or more parameters are null.");

        if (!isFileValid(anInFile)) {
            String msgStr = String.format("%s: Does not exist or is empty.", anInFile.getAbsolutePath());
            throw new NSException(msgStr);
        }

        // Type detection reads the file, so only pay for it when the message will be logged.
        if (appLogger.isDebugEnabled())
            appLogger.debug(String.format("[%s] %s", detectType(anInFile), anInFile.getAbsolutePath()));

        ForkParser forkParser = null;
        Metadata tikaMetaData = new Metadata();
        tikaMetaData.set(Metadata.RESOURCE_NAME_KEY, anInFile.getName());
        int contentLimit = getCfgInteger("content_limit", Content.CONTENT_LIMIT_DEFAULT);

        InputStream inputStream = null;
        try {
            Parser tikaParser;

            inputStream = TikaInputStream.get(anInFile.toPath());
            ParseContext parseContext = new ParseContext();
            if (isCfgStringTrue("tika_fork_parser")) {
                forkParser = new ForkParser(ContentExtractor.class.getClassLoader(), new AutoDetectParser());
                String javaCmdStr = getCfgString("tika_fork_java_cmd");
                if (StringUtils.isNotEmpty(javaCmdStr))
                    forkParser.setJavaCommand(javaCmdStr);
                int poolSize = getCfgInteger("tika_fork_pool_size", 5);
                if (poolSize > 0)
                    forkParser.setPoolSize(poolSize);
                tikaParser = forkParser;
            } else {
                tikaParser = new AutoDetectParser();
                Parser recursiveMetadataParser = new RecursiveMetadataParser(tikaParser);
                parseContext.set(Parser.class, recursiveMetadataParser);
            }

            WriteOutContentHandler writeOutContentHandler = new WriteOutContentHandler(aWriter, contentLimit);
            tikaParser.parse(inputStream, writeOutContentHandler, tikaMetaData, parseContext);
        } catch (Exception e) {
            String eMsg = e.getMessage();
            String msgStr = String.format("%s: %s", anInFile.getAbsolutePath(), eMsg);

            /* An exception whose message starts with the text below only signals that the
            total character limit threshold was hit.  That is not an error: the content
            written so far is kept and processing continues with the meta data. */

            if (StringUtils.startsWith(eMsg, "Your document contained more than"))
                appLogger.warn(msgStr);
            else {
                // The NSException only carries the message, so log the cause to keep its stack trace.
                appLogger.error(msgStr, e);
                throw new NSException(msgStr);
            }
        } finally {
            if (inputStream != null)
                IOUtils.closeQuietly(inputStream);
            // Close the fork parser here so it is released even when parsing fails.
            if (forkParser != null)
                forkParser.close();
        }

        if ((mBag != null) && (isCfgStringTrue("content_metadata"))) {
            for (String mdName : tikaMetaData.names()) {
                String mdValue = tikaMetaData.get(mdName);
                if (StringUtils.isNotEmpty(mdValue))
                    addAssignField(Content.CONTENT_FIELD_METADATA + mdName, mdValue);
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * This method will extract the textual content from the URL
     * and write it to the writer stream.  If a bag instance has been
     * registered with the class and the <i>content_metadata</i> property
     * is enabled, then meta data fields will dynamically be assigned as
     * they are discovered.
     * <p>
     * Reaching the configured <i>content_limit</i> is not treated as a
     * failure: a warning is logged and the content written up to that
     * point is kept.  The writer is owned by the caller and is neither
     * flushed nor closed here.
     * </p>
     *
     * @param aURL URL of the resource.
     * @param aWriter Output writer stream.
     *
     * @throws NSException Thrown when a parameter is null or the content
     *                     could not be retrieved or parsed.
     */
    @SuppressWarnings("deprecation")
    public void process(URL aURL, Writer aWriter) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        if ((aURL == null) || (aWriter == null))
            throw new NSException("One or more parameters are null.");

        String documentName = aURL.toString();

        // Type detection opens the URL, so only pay for it when the message will be logged.
        if (appLogger.isDebugEnabled())
            appLogger.debug(String.format("[%s] %s", detectType(aURL), documentName));

        // Declared outside the try block so the finally clause can release it.
        ForkParser forkParser = null;
        Metadata tikaMetaData = new Metadata();
        int contentLimit = getCfgInteger("content_limit", Content.CONTENT_LIMIT_DEFAULT);

        InputStream inputStream = null;
        try {
            Parser tikaParser;

            inputStream = TikaInputStream.get(aURL);
            ParseContext parseContext = new ParseContext();
            if (isCfgStringTrue("tika_fork_parser")) {
                forkParser = new ForkParser(ContentExtractor.class.getClassLoader(),
                        new AutoDetectParser());
                String javaCmdStr = getCfgString("tika_fork_java_cmd");
                if (StringUtils.isNotEmpty(javaCmdStr))
                    forkParser.setJavaCommand(javaCmdStr);
                int poolSize = getCfgInteger("tika_fork_pool_size", 5);
                if (poolSize > 0)
                    forkParser.setPoolSize(poolSize);
                tikaParser = forkParser;
            } else {
                tikaParser = new AutoDetectParser();
                Parser recursiveMetadataParser = new RecursiveMetadataParser(tikaParser);
                parseContext.set(Parser.class, recursiveMetadataParser);
            }

            WriteOutContentHandler writeOutContentHandler = new WriteOutContentHandler(aWriter, contentLimit);
            tikaParser.parse(inputStream, writeOutContentHandler, tikaMetaData, parseContext);
        } catch (Exception e) {
            String eMsg = e.getMessage();
            String msgStr = String.format("%s: %s", documentName, eMsg);

            /* An exception whose message starts with the text below only signals that the
            total character limit threshold was hit.  That is not an error: the content
            written so far is kept and processing continues with the meta data. */

            if (StringUtils.startsWith(eMsg, "Your document contained more than"))
                appLogger.warn(msgStr);
            else {
                // The NSException only carries the message, so log the cause to keep its stack trace.
                appLogger.error(msgStr, e);
                throw new NSException(msgStr);
            }
        } finally {
            if (inputStream != null)
                IOUtils.closeQuietly(inputStream);
            // Previously the fork parser was never closed on this code path.
            if (forkParser != null)
                forkParser.close();
        }

        if ((mBag != null) && (isCfgStringTrue("content_metadata"))) {
            for (String mdName : tikaMetaData.names()) {
                String mdValue = tikaMetaData.get(mdName);
                if (StringUtils.isNotEmpty(mdValue))
                    addAssignField(Content.CONTENT_FIELD_METADATA + mdName, mdValue);
            }
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * This method will extract the textual content from the input file
     * and write it to the output file using the character encoding named
     * by the <i>content_encoding</i> property (UTF-8 by default).  If a
     * bag instance has been registered with the class, then meta data
     * fields will dynamically be assigned as they are discovered.
     *
     * @param anInFile Input file instance.
     * @param anOutFile Output file instance.
     *
     * @throws NSException Thrown when a parameter is null, when the output
     *                     file cannot be written or when the extraction
     *                     itself fails.
     */
    public void process(File anInFile, File anOutFile) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        if ((anInFile == null) || (anOutFile == null))
            throw new NSException("One or more parameters are null.");

        String contentEncoding = getCfgString("content_encoding", StrUtl.CHARSET_UTF_8);

        // The writer must be a managed resource as well: closing only the underlying
        // stream discards whatever text is still sitting in the writer's buffer.
        try (FileOutputStream fileOutputStream = new FileOutputStream(anOutFile);
             BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(fileOutputStream, contentEncoding))) {
            process(anInFile, bufferedWriter);
        } catch (IOException e) {
            String msgStr = String.format("%s: %s", anInFile.getAbsolutePath(), e.getMessage());
            appLogger.error(msgStr, e);
            throw new NSException(e);
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * Convenience overload that wraps both path/file names in File
     * instances and delegates to the file-to-file extraction method.
     * If a bag instance has been registered with the class, then meta
     * data fields will dynamically be assigned as they are discovered.
     *
     * @param anInputPathFileName Input path/file name.
     * @param anOutputPathFileName Output path/file name.
     *
     * @throws NSException Thrown when IOExceptions are detected.
     */
    public void process(String anInputPathFileName, String anOutputPathFileName) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        File inputFile = new File(anInputPathFileName);
        File outputFile = new File(anOutputPathFileName);
        process(inputFile, outputFile);

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }

    /**
     * Extracts the textual content of the input file into an in-memory
     * buffer and returns it as a string.  If a bag instance has been
     * registered with the class, then meta data fields will dynamically
     * be assigned as they are discovered.
     *
     * @param anInFile Input file instance.
     *
     * @return String representation of the textual content.
     *
     * @throws NSException Thrown when IOExceptions are detected.
     */
    public String process(File anInFile) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        StringWriter contentBuffer = new StringWriter();
        PrintWriter contentWriter = new PrintWriter(contentBuffer);
        try {
            process(anInFile, contentWriter);
        } finally {
            // Closing pushes any pending characters into the string buffer.
            contentWriter.close();
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return contentBuffer.toString();
    }

    /**
     * Extracts the textual content of the URL resource into an in-memory
     * buffer and returns it as a string.  If a bag instance has been
     * registered with the class, then meta data fields will dynamically
     * be assigned as they are discovered.
     *
     * @param aURL URL of the resource.
     *
     * @return String representation of the textual content.
     *
     * @throws NSException Thrown when IOExceptions are detected.
     */
    public String process(URL aURL) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        StringWriter contentBuffer = new StringWriter();
        PrintWriter contentWriter = new PrintWriter(contentBuffer);
        try {
            process(aURL, contentWriter);
        } finally {
            // Closing pushes any pending characters into the string buffer.
            contentWriter.close();
        }

        String contentString = contentBuffer.toString();

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return contentString;
    }

    /**
     * Convenience overload that wraps the path/file name in a File
     * instance and returns the textual content extracted from it.
     * If a bag instance has been registered with the class, then meta
     * data fields will dynamically be assigned as they are discovered.
     *
     * @param anInputPathFileName Input path/file name.
     *
     * @return String representation of the textual content.
     *
     * @throws NSException Thrown when IOExceptions are detected.
     */
    public String process(String anInputPathFileName) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        File inputFile = new File(anInputPathFileName);
        String extractedText = process(inputFile);

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);

        return extractedText;
    }

    /**
     * Extracts the textual content from the input file and assigns it
     * to the content field.  The field is left untouched when no content
     * was extracted.  If a bag instance has been registered with the
     * class, then meta data fields will dynamically be assigned as they
     * are discovered.
     *
     * @param anInputPathFileName Input path/file name.
     * @param aContentField Content data field instance.
     *
     * @throws NSException Thrown when the content field is null or when
     *                     IOExceptions are detected.
     */
    public void process(String anInputPathFileName, DataField aContentField) throws NSException {
        Logger appLogger = mAppMgr.getLogger(this, "process");

        appLogger.trace(mAppMgr.LOGMSG_TRACE_ENTER);

        if (aContentField == null) {
            throw new NSException("Content data field is null.");
        }

        String extractedText = process(anInputPathFileName);
        if (!StringUtils.isEmpty(extractedText)) {
            aContentField.setValue(extractedText);
        }

        appLogger.trace(mAppMgr.LOGMSG_TRACE_DEPART);
    }
}