org.opensextant.xtext.XText.java Source code

Introduction

Here is the source code for org.opensextant.xtext.XText.java
Source

/**
 *
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____                                ____                     __                       __
///\  __`\                             /\  _`\                  /\ \__                   /\ \__
//\ \ \/\ \   _____      __     ___    \ \,\L\_\      __   __  _\ \ ,_\     __       ___ \ \ ,_\
// \ \ \ \ \ /\ '__`\  /'__`\ /' _ `\   \/_\__ \    /'__`\/\ \/'\\ \ \/   /'__`\   /' _ `\\ \ \/
//  \ \ \_\ \\ \ \L\ \/\  __/ /\ \/\ \    /\ \L\ \ /\  __/\/>  </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
//   \ \_____\\ \ ,__/\ \____\\ \_\ \_\   \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
//    \/_____/ \ \ \/  \/____/ \/_/\/_/    \/_____/ \/____/\//\/_/  \/__/ \/__/\/_/ \/_/\/_/ \/__/
//            \ \_\
//             \/_/
//
//   OpenSextant XText
// *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
// */
package org.opensextant.xtext;

import static org.apache.commons.lang3.StringUtils.isBlank;
import gnu.getopt.LongOpt;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import javax.activation.MimeType;
import javax.activation.MimeTypeParseException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.tika.io.IOUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.collectors.ArchiveNavigator;
import org.opensextant.xtext.collectors.mailbox.OutlookPSTCrawler;
import org.opensextant.xtext.converters.DefaultConverter;
import org.opensextant.xtext.converters.EmbeddedContentConverter;
import org.opensextant.xtext.converters.ImageMetadataConverter;
import org.opensextant.xtext.converters.MessageConverter;
import org.opensextant.xtext.converters.TextTranscodingConverter;
import org.opensextant.xtext.converters.TikaHTMLConverter;
import org.opensextant.xtext.converters.WebArchiveConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 * Traverse a folder and return text versions of the documents found. Archiving
 * the text only copies at an output location of your choice.
 *
 * <pre>
 *
 * if input is a file, convert. Done.
 *
 * if input is an archive, unpack in temp space, iterate over dir, convert each.
 * Done
 *
 * if input is a folder iterate over dir, convert each. Done
 * </pre>
 *
 * TEXT OUTPUT form includes a JSON document header with metadata properties
 * from the original item. These are valid elements of the conversion process.
 * We try to maintain them apart from the true, readable text of the document.
 *
 *
 * Add a ConversionListener to an XText instance to capture each converted
 * document as it comes out of the main loop for converting archives and folders.
 *
 * extractText() runs over any file type and extracts text, saving it and
 * pushing events to one optional listener
 *
 * convertFile(File) will convert a single file, returning a ConvertedDocument
 *
 *
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public final class XText implements ExclusionFilter, Converter {

    /** Per-instance SLF4J logger named after this class. */
    private final Logger log = LoggerFactory.getLogger(getClass());
    /** When true, the HTML converter built in setup() is asked to scrub HTML content. */
    private boolean scrubHTML = false;

    /** Manages input roots, export folders and the conversion cache for this instance. */
    private final PathManager paths = new PathManager();

    /**
     * Exposes the path manager used by this instance.
     *
     * @return the PathManager held in the final {@code paths} field
     */
    public PathManager getPathManager() {
        return this.paths;
    }

    /**
     * flag to manage if children are extracted or not.
     */
    private boolean extractEmbedded = false;

    /**
     * XText default is 1 MB of text.
     * The value is handed to the default and embedded converters when setup() runs.
     */
    private int maxBuffer = DefaultConverter.MAX_TEXT_SIZE;
    /**
     * Heuristic - HTML content is likely 5x, maybe a lot more, the size of the
     * plain text it contains. So with 1 MB the target max text size, 5 MB would be
     * the largest HTML document accepted here, by default.
     *
     * NOTE: this field is final and computed once, at construction, from the
     * initial value of maxBuffer. A later call to setMaxBufferSize() does not
     * change it.
     */
    private final int maxHTMLBuffer = 5 * maxBuffer;
    /** Largest file, in bytes, that convertFile() will attempt; defaults to FILE_SIZE_LIMIT. */
    private long maxFileSize = FILE_SIZE_LIMIT;

    /** Lower-case file extensions that are treated as archives; populated by defaults(). */
    protected Set<String> archiveFileTypes = new HashSet<String>();

    /**
     * Registry of converters keyed by lower-case file extension.
     *
     * NOTE: this map is public and static, so it is shared by every XText
     * instance in the JVM; clearSettings() on one instance empties it for all.
     */
    public static Map<String, Converter> converters = new HashMap<String, Converter>();
    /** Fallback converter used when no extension-specific converter is registered; created in setup(). */
    private Converter defaultConversion;
    /** Converter used for embedded-content extraction when that mode is enabled; created in setup(). */
    private Converter embeddedConversion;
    /** Lower-case extensions the caller wants converted. */
    private final Set<String> requestedFileTypes = new HashSet<String>();
    /** Lower-case extensions the caller wants skipped. */
    private final Set<String> ignoreFileTypes = new HashSet<String>();
    /** When true, files without an extension are not rejected by the extension filters. */
    private boolean allowNoExtension = false;

    /**
     * Creates an XText instance pre-loaded with the default archive and
     * document file types registered by {@link #defaults()}.
     */
    public XText() {
        this.defaults();
    }

    /**
     * Turns overwrite mode on or off. The flag is stored in the static field
     * {@code ConvertedDocument.overwrite}, so it is not specific to this instance.
     *
     * @param b
     *            true to overwrite, false otherwise
     */
    public void enableOverwrite(boolean b) {
        final boolean overwriteRequested = b;
        ConvertedDocument.overwrite = overwriteRequested;
    }

    /**
     * Sets the folder where conversions are cached by forwarding to
     * {@code PathManager.setConversionCache(String)}.
     *
     * @param root
     *            the new archive dir
     * @throws IOException
     *             on err
     * @deprecated call getPathManager().setConversionCache( path ) directly;
     *             that is all this method does.
     */
    @Deprecated
    public void setArchiveDir(String root) throws IOException {
        this.paths.setConversionCache(root);
    }

    /**
     * Sets the maximum text buffer size passed to the default and embedded
     * converters. Those converters are constructed in setup(), so call this
     * before setup(). The HTML buffer limit is a final field computed at
     * construction and is not affected by this call.
     *
     * @param sz
     *            new maximum buffer size
     */
    public void setMaxBufferSize(int sz) {
        this.maxBuffer = sz;
    }

    /**
     * Sets the largest file size convertFile() will accept; larger files are
     * logged and skipped. The limit is stored as a long.
     *
     * @param sz
     *            maximum file size, compared against FileUtils.sizeOf(file)
     */
    public void setMaxFileSize(int sz) {
        this.maxFileSize = sz;
    }

    /**
     * Controls whether files lacking an extension are accepted. When enabled,
     * filterOutFile() lets extension-less paths through and convertFile()
     * skips its extension checks.
     *
     * @param b
     *            true to accept files without an extension
     */
    public void enableNoFileExtension(boolean b) {
        this.allowNoExtension = b;
    }

    /**
     * Enables or disables HTML scrubbing. The flag is read when setup()
     * constructs the HTML converter, so set it before calling setup().
     * Default: no scrubbing.
     *
     * @param b
     *            true to scrub HTML content
     */
    public void enableHTMLScrubber(boolean b) {
        this.scrubHTML = b;
    }

    /**
     * Switches extraction of embedded child documents on or off. With it on,
     * convertFile() routes supported types to the embedded-content converter
     * and bypasses the conversion cache for them. Embedded extraction may
     * yield many small sub documents, aka children.
     *
     * @param b
     *            true to enable
     */
    public void enableEmbeddedExtraction(boolean b) {
        this.extractEmbedded = b;
    }

    /**
     * The overall switch for saving converted output, forwarded to the
     * PathManager. Documented default: true = save it, provided the caller
     * either saves with the input or provides an archive root.
     *
     * @param b
     *            true to enable
     */
    public void enableSaving(boolean b) {
        this.paths.enableSaving(b);
    }

    /**
     * Add the file extension for the file type you wish to convert. If Tika
     * supports it by default it should be no problem.
     * Adding requested file types here only allows the API to know by file
     * extension what types to filter in and convert. Without a file extension,
     * the file still needs to be ingested and converted to identify the file type.
     *
     * The extension is stored lower-cased. A null extension is ignored, which
     * matches the behavior of {@link #ignoreFileType(String)}.
     *
     * @param ext
     *            a file extension to convert; null is a no-op
     */
    public void convertFileType(String ext) {
        if (ext != null) {
            requestedFileTypes.add(ext.toLowerCase());
        }
    }

    /**
     * Ignore files of type ext. The extension is lower-cased before it is
     * stored, so matching against lower-cased file extensions is effectively
     * case-insensitive. A null extension is silently ignored.
     *
     * @param ext
     *            a file extension to NOT convert
     */
    public void ignoreFileType(String ext) {
        if (ext == null) {
            return;
        }
        final String normalized = ext.toLowerCase();
        ignoreFileTypes.add(normalized);
    }

    /** Optional listener notified by convertFile() for top-level (parent-less) documents; null means none. */
    private ConversionListener postProcessor = null;

    /**
     * Registers the single conversion listener. A listener is any outside
     * routine that does something more with a converted document. When none
     * is set, nothing extra happens. Passing a new listener replaces the
     * previous one.
     *
     * @param processor
     *            a listener that handles the documents that have been found
     */
    public void setConversionListener(ConversionListener processor) {
        this.postProcessor = processor;
    }

    /** When false (default), PST files are routed to convertOutlookPST(); when true they are treated as ordinary files. */
    private boolean useTikaPST = false;

    /**
     * Chooses how PST files are handled. With the flag off, extractText() and
     * convertFile() send PST files to convertOutlookPST(); with it on, PST
     * files go through the regular converter path.
     *
     * @param flag
     *            true to bypass the dedicated PST crawler
     */
    public void enableTikaPST(boolean flag) {
        this.useTikaPST = flag;
    }

    /**
     * Tests whether a path names an archive, judged by its final extension
     * (lower-cased) against the registered archive file types.
     *
     * @param fpath
     *            file path or file name to test
     * @return true if the extension is a registered archive type; false
     *         otherwise, including when no extension can be determined
     */
    public boolean isArchive(String fpath) {
        final String extension = FilenameUtils.getExtension(fpath);
        return extension != null && archiveFileTypes.contains(extension.toLowerCase());
    }

    /**
     * Tests whether a path names an Outlook PST file, judged by its extension.
     *
     * @param fpath
     *            file path or file name to test
     * @return true if the extension is "pst", ignoring case
     */
    public boolean isPST(String fpath) {
        final String extension = FilenameUtils.getExtension(fpath);
        return isPSTExtension(extension);
    }

    /**
     * Tests whether an extension denotes an Outlook PST file.
     *
     * @param ext
     *            file extension without the dot; may be null
     * @return true only if ext equals "pst", ignoring case; false for null
     */
    public static boolean isPSTExtension(String ext) {
        // String.equalsIgnoreCase(null) is false, so null needs no separate branch.
        return "pst".equalsIgnoreCase(ext);
    }

    /** Sum of conversion_time over all non-null documents passed to trackStatistics(). */
    protected long total_conv_time = 0;
    /** Mean conversion time; only refreshed when reportStatistics() runs. */
    protected int average_conv_time = 0;
    /** Number of trackStatistics() calls, including those made with a null document. */
    protected int total_conversions = 0;

    /**
     * Accumulates conversion counts and times. Every call increments the
     * conversion counter; the elapsed time is added only when a document is
     * supplied. This may not account for error'd documents.
     *
     * @param d
     *            ConvertedDocument, or null when no document was produced
     */
    protected void trackStatistics(ConvertedDocument d) {
        total_conversions++;
        if (d == null) {
            return;
        }
        total_conv_time = total_conv_time + d.conversion_time;
    }

    /**
     * Recomputes the average conversion time from the running totals and logs
     * a summary at INFO level. The division is done in floating point and then
     * truncated to int.
     */
    public void reportStatistics() {
        final float mean = (float) total_conv_time / total_conversions;
        average_conv_time = (int) mean;

        final StringBuilder summary = new StringBuilder();
        summary.append("TOTAL of N=").append(total_conversions).append(" documents converted");
        summary.append("\n With an average time (ms) of ").append(average_conv_time);
        log.info(summary.toString());
    }

    /** Wall-clock time (ms since epoch) at which the latest extractText() call began. */
    protected long start_time = 0;
    /** Wall-clock time (ms since epoch) at which the latest extractText() call finished converting; not set if it threw. */
    protected long stop_time = 0;

    /**
     * Optional API routine and main entry point for converting a file, folder,
     * archive or PST. If XText is used as a main program, this is where
     * extraction/collection starts. API callers may use convertFile() directly
     * without the setup and assumptions behind this convenience method.
     *
     * The input path is made absolute and normalized, rejected if it does not
     * exist or lies in an XText cache, then dispatched by kind. Start and stop
     * times are recorded and statistics are reported at the end.
     *
     * @param filepath
     *            item from which we extract text
     * @throws IOException
     *             if the path cannot be normalized, does not exist, or conversion fails
     * @throws ConfigException
     *             if the path points into an XText cache, or on conversion setup error
     */
    public void extractText(String filepath) throws IOException, ConfigException {

        start_time = System.currentTimeMillis();

        log.info("Conversion.  INPUT PATH={}", filepath);
        final String absolutePath = new File(filepath).getAbsolutePath();
        final String path = FilenameUtils.normalize(absolutePath, true);
        if (path == null) {
            throw new IOException("Failed to normalize the path: " + filepath);
        }

        final File input = new File(path);
        if (!input.exists()) {
            throw new IOException("Non existent input FILE=" + path);
        }

        // The cache check is made on the absolute path.
        if (PathManager.isXTextCache(path)) {
            throw new ConfigException("XText cannot be directed to extract text from its own cache files. "
                    + "Move the cache files out of ./xtext/ folders if you really need to do this.");
        }

        final String inputName = input.getName();
        if (isArchive(inputName)) {
            // Archive originals are collected to "export"; conversions are saved to "output".
            // PathManager is STATEFUL for as long as this archive is processing.
            // If an archive is uncovered while traversing files, its contents can be
            // dumped to the child export folder.
            convertArchive(input);
        } else if (!useTikaPST && isPST(inputName)) {
            this.convertOutlookPST(input);
        } else {
            final boolean plainFile = input.isFile();
            if (plainFile || input.isDirectory()) {
                // If prefix is not set, then conversion will be dumped flatly to output area.
                paths.setInputRoot(input);
                if (plainFile) {
                    convertFile(input);
                } else {
                    convertFolder(input);
                }
            }
        }

        stop_time = System.currentTimeMillis();

        if (paths.isSaving()) {
            final boolean besideInput = paths.isSaveWithInput();
            if (!besideInput) {
                log.info("Output can be accessed at " + paths.getConversionCache());
            } else {
                log.info("Output can be accessed at from the input folder {} in 'xtext' sub-folders",
                        input.getParent());
            }
        }

        reportStatistics();
    }

    /**
     * File-object variant of the exclusion filter. A file that PathManager
     * identifies as part of an XText cache (e.g. "./a/b/c/xtext/file.doc.txt")
     * is excluded outright; anything else is judged by the path-based filter
     * on its absolute path.
     *
     * @param input
     *            file obj
     * @return true if the file should be skipped
     */
    private boolean filterOutFile(File input) {
        final boolean insideCache = PathManager.isXTextCache(input);
        return insideCache || filterOutFile(input.getAbsolutePath());
    }

    /**
     * Decides whether a path should be skipped. A path is filtered out when it
     * lies in an XText cache, when its base name starts with ".", when the
     * path contains ".svn", when its base name ends with the converted-text
     * suffix, when it has no extension and extension-less files are not
     * allowed, or when its lower-cased extension is not a requested type.
     *
     * @param filepath
     *            path to test
     * @return true to exclude the path, false to keep it
     */
    @Override
    public boolean filterOutFile(String filepath) {

        // Never re-ingest our own xtext caches.
        if (PathManager.isXTextCache(filepath)) {
            return true;
        }

        final String baseName = FilenameUtils.getBaseName(filepath);
        final boolean hiddenOrGenerated = baseName.startsWith(".")
                || filepath.contains(".svn")
                // '-utf8.txt' style names are likely XText output.
                || baseName.endsWith(ConvertedDocument.CONVERTED_TEXT_EXT);
        if (hiddenOrGenerated) {
            return true;
        }

        final String extension = FilenameUtils.getExtension(filepath);
        if (isBlank(extension)) {
            return !allowNoExtension;
        }
        final boolean requested = requestedFileTypes.contains(extension.toLowerCase());
        return !requested;
    }

    /**
     * Unpack an archive and convert items found.
     * Given (input)/A.zip
     * The zip is dearchived to
     * (input)/A_zip/
     * or (archive)/(input)/A_zip
     *
     * Items are then converted in either folder for the conversion archiving,
     * depending on your choice of embedded vs. non-embedded.
     *
     * While the archive is processed, the PathManager's strip prefix and input
     * root point at the archive's export folder. The previous strip prefix is
     * restored when processing ends, including when collection fails, so a
     * broken archive does not leave later conversions with the wrong prefix.
     *
     * @param input
     *            archive file object
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err
     *
     */
    public void convertArchive(File input) throws IOException, ConfigException {

        // Nothing to do if the PathManager cannot verify the export location.
        if (!paths.verifyArchiveExport(input.getAbsolutePath())) {
            return;
        }

        File saveFolder = paths.getArchiveExportDir(input);
        String savePrefix = paths.getStipPrefixPath();

        try {
            paths.setStripPrefixPath(saveFolder.getAbsolutePath());
            paths.setInputRoot(saveFolder);

            // This instance is passed for both trailing constructor arguments;
            // XText implements ExclusionFilter and Converter.
            ArchiveNavigator deArchiver = new ArchiveNavigator(input, saveFolder.getAbsolutePath(), this, this);
            deArchiver.overwrite = ConvertedDocument.overwrite;

            log.info("\tArchive Found ({}). Expanding to {}", input, saveFolder);

            deArchiver.collect();
        } finally {
            // Done (or failed): put the caller's strip prefix back.
            paths.setStripPrefixPath(savePrefix);
        }
    }

    /**
     * Crawls an Outlook PST file with OutlookPSTCrawler, using this XText
     * instance as the converter for items found. Messages are exported to the
     * archive export folder for the PST.
     *
     * If saving is disabled, an error is logged but processing still proceeds.
     *
     * While the PST is processed, the PathManager's strip prefix and input root
     * point at the export folder. The previous strip prefix is restored when
     * processing ends, including when configuration or collection fails.
     *
     * @param input
     *            input PST object
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err, including any exception raised while collecting
     */
    public void convertOutlookPST(File input) throws ConfigException, IOException {
        if (!paths.isSaving()) {
            log.error(
                    "Warning -- PST file found, but save = true is required to parse it.  Enable saving and chose a cache folder");
        }

        OutlookPSTCrawler pst = new OutlookPSTCrawler(input);
        pst.setConverter(this);
        pst.overwriteMode = ConvertedDocument.overwrite;
        pst.incrementalMode = true;

        File saveFolder = paths.getArchiveExportDir(input);
        String savePrefix = paths.getStipPrefixPath();

        try {
            paths.setStripPrefixPath(saveFolder.getAbsolutePath());
            paths.setInputRoot(saveFolder);
            pst.setOutputPSTDir(saveFolder);
            pst.configure();

            log.info("\tPST Email Archive Found ({}). Expanding to {}", input, saveFolder);

            try {
                pst.collect();
            } catch (Exception err) {
                throw new ConfigException("Unable to fully digest PST file " + input, err);
            }
        } finally {
            // Done (or failed): put the caller's strip prefix back.
            paths.setStripPrefixPath(savePrefix);
        }
    }

    /**
     * Arbitrary 32 MB limit on file size (0x2000000 = 33,554,432 bytes). Maybe
     * this should be dependent on the file type. Used as the initial value of
     * the per-instance maximum file size.
     */
    public static final long FILE_SIZE_LIMIT = 0x2000000;

    /**
     * Converter-interface entry point, used as the proxy for traversing
     * archives: XText passes itself as the converter to the archive navigator
     * and PST crawler, which call back here for each item.
     *
     * XText itself is thus a super-converter, whereas the items in the
     * converter pkg are described as stateless, simple conversions. This
     * method simply forwards to convertFile(File), which deals with the
     * details of saving and archiving items.
     *
     * @param input
     *            file
     * @return the result of convertFile(input); may be null
     * @throws ConfigException
     *             on err
     * @throws IOException
     *             on err
     */
    @Override
    public ConvertedDocument convert(File input) throws IOException, ConfigException {
        return this.convertFile(input);
    }

    /**
     * Unsupported Converter interface method; it always throws. To convert
     * text from a String obj rather than a File obj, instantiate a converter
     * implementation for the data you think you are converting. E.g., if you
     * have a buffer of HTML content and want to save it as text, call
     * TikaHTMLConverter().convert( buffer ) directly.
     *
     * @param data
     *            raw data (unused)
     * @return never returns normally
     * @throws IOException
     *             always, to signal the unsupported operation
     */
    @Override
    public ConvertedDocument convert(String data) throws IOException {
        final String reason = "Unsupported interface:  To convert text or binary data directly "
                + "you must use an instance of a XText converter, e.g., TikaHTMLConverter";
        throw new IOException(reason);
    }

    /**
     * Converts a single top-level file, i.e. one with no parent document.
     * Equivalent to convertFile(input, null).
     *
     * @param input
     *            the input
     * @return the converted document, or null if the file was filtered out or
     *         has no text representation of its own
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err
     */
    public ConvertedDocument convertFile(File input) throws IOException, ConfigException {
        final ConvertedDocument noParent = null;
        return convertFile(input, noParent);
    }

    /**
     * Convert one file and save it off. We ignore hidden files and files in
     * hidden folders, e.g., .cvs_ignore, mycode/.svn/abc.txt
     *
     * This is the end of the line for the conversion logic; convertFile figures
     * out if it should return the cached version or attempt a conversion; it
     * also tries to save and convert children items (see convertChildren()).
     * The ConversionListener, if set, is only notified for parent documents;
     * callers can deal with children file objects on their end.
     *
     * Returns null, without notifying the listener or updating statistics, when:
     * the file is filtered out (top-level files only), its extension is ignored
     * or not requested (unless extension-less files are allowed), it is an
     * archive or PST that was expanded instead, or it exceeds the maximum file
     * size.
     *
     * @param input
     *            child input obj to convert
     * @param parent
     *            parent in which child was found; null for a top-level file
     * @return converted document object, or null as described above
     * @throws IOException
     *             on err; any exception thrown by the converter is wrapped in an IOException
     * @throws ConfigException
     *             on err
     */
    public ConvertedDocument convertFile(File input, ConvertedDocument parent) throws IOException, ConfigException {

        // The exclusion filter applies to top-level files only; children skip it.
        if (parent == null && filterOutFile(input)) {
            return null;
        }

        // Saving to a separate cache requires an input root to be configured.
        if (paths.isSaving()) {
            if (!paths.isSaveWithInput() && !paths.hasInputRoot()) {

                throw new IOException(
                        "Please set an input root; convertFile() was called in save/cache mode without having PathManager setup");
            }
        }

        String fname = input.getName();

        String ext = FilenameUtils.getExtension(fname).toLowerCase();
        // NOTE(review): when allowNoExtension is true, BOTH the ignore list and
        // the requested list are skipped for every file, including files that do
        // have an extension. Confirm this is intended.
        if (!allowNoExtension) {
            if (ignoreFileTypes.contains(ext)) {
                return null;
            }

            if (!requestedFileTypes.contains(ext)) {
                return null;
            }
        }

        log.debug("Converting FILE=" + input.getAbsolutePath());

        /*
         * Handle archives or PST files. Or other large compound single file.
         */
        if (isArchive(fname)) {
            convertArchive(input);

            // NULL here implies the actual file, A.zip does not have any text representation itself.
            // However its children do.
            return null;
        } else if (isPSTExtension(ext) && !useTikaPST) {
            convertOutlookPST(input);
            return null;
        }

        /*
         * Otherwise this is a normal file...
         */
        if (FileUtils.sizeOf(input) > maxFileSize) {
            log.info("Valid File is too large FILE=" + input.getAbsolutePath());
            return null;
        }

        // Converter selection: extension-specific first, then embedded-content
        // (if enabled and supported), then the default converter.
        boolean cachable = true;
        Converter converter = converters.get(ext);
        if (converter == null) {
            if (extractEmbedded && EmbeddedContentConverter.isSupported(ext)) {
                converter = embeddedConversion;
                cachable = false; // Such content is processed every time.  Oh well...
            } else {
                converter = defaultConversion;
            }
        }

        ConvertedDocument textDoc = null;

        // ------------------
        // Retrieve previous conversions
        // ------------------
        if (cachable && !ConvertedDocument.overwrite && paths.isSaving()) {
            textDoc = paths.getCachedConversion(input);
        }

        // ------------------
        // Convert or Read object, IFF no cache exists for that object.
        // ------------------
        if (textDoc == null) {
            // Measure how long conversions take.
            long t1 = System.currentTimeMillis();

            try {
                textDoc = converter.convert(input);
            } catch (Exception convErr) {
                throw new IOException("Conversion error FILE=" + input.getPath(), convErr);
            }
            long t2 = System.currentTimeMillis();
            int duration = (int) (t2 - t1);
            if (textDoc != null) {
                // Buffer can be null. If you got this far, you are interested
                // in the file, as it passed
                // all filters above. Return the document with whatever metadata
                // it found.
                // if (textDoc.buffer == null) {
                // throw new
                // IOException("Engineering error: Doc converted, but converter failed to setText()");
                // }
                if (paths.isSaving() && textDoc.is_converted) {
                    // Get Parent info in there.
                    if (parent != null) {
                        textDoc.setParent(parent);
                    }

                    paths.saveConversion(textDoc);

                    // Children items will be persisted in the same folder
                    // structure where the textdoc.textpath resides.
                    // That is, Email or Embedded objects that are parsed are
                    // saved in ./xtext/ folder or in the separate archive.
                    // But this must be done now, as we have all the dynamic
                    // metadata + raw artifacts; As it is all written out to
                    // disk,
                    // it will be written out together.
                    //
                    if (textDoc.hasRawChildren()) {
                        convertChildren(textDoc);

                        // 1. children saved to disk
                        // 2. children converted.
                        // 3. children attached to parent here.
                        // 'textdoc' should now be well endowed with all the
                        // children metadata.
                    }
                }
            } else {
                // Converter produced nothing: return a bare document for the input.
                textDoc = new ConvertedDocument(input);
            }

            // The timing is recorded only for fresh conversions, not for cache hits.
            textDoc.conversion_time = duration;
            if (textDoc.filetime == null) {
                textDoc.filetime = textDoc.getFiletime();
            }
        }

        /*
         * Conversion Listeners are called only for parent documents. That is
         * for an email with 4 attachments, this listener is called on the
         * parent email message, but not for the individual 4 attachments. The
         * final parent document here will have all Raw Children (bytes +
         * metadata) and Converted Children (ConvertedDocument obj) Caller will
         * have to detect if returned item via listener is a Parent with
         * Children.
         *
         * Behavior here is TBD.
         */
        if (postProcessor != null && parent == null) {
            postProcessor.handleConversion(textDoc, input.getAbsolutePath());
        }

        trackStatistics(textDoc);
        return textDoc;
    }

    /**
     * Walks a folder recursively and converts every file whose name matches
     * the configured suffix filters (case-insensitive). A failure on one file
     * is logged and the walk continues, so one bad file does not sacrifice the
     * entire job. Results reach the caller through the conversion listener.
     *
     * @param input
     *            the folder to traverse
     * @throws IOException
     *             on err
     */
    public void convertFolder(File input) throws IOException {
        final SuffixFileFilter bySuffix = new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE);
        for (File candidate : FileUtils.listFiles(input, bySuffix, FileFilterUtils.trueFileFilter())) {
            try {
                convertFile(candidate);
            } catch (Exception convErr) {
                // Deliberately trapped per file; log and move on.
                log.error("Conversion error, FILE=" + candidate.getPath(), convErr);
            }
        }
    }

    /**
     * Save children objects for a given ConvertedDocument to its parent
     * container folder, then convert those items immediately, saving the
     * parent metadata along with them. You should have setParent already.
     *
     * Web archives are treated as a single document and have no children
     * processed. Children with no content are logged and skipped. A failure on
     * one child, including a failure to close its file, is logged and the
     * remaining children are still processed.
     *
     * Each child is written and its file closed BEFORE it is converted, so the
     * converter never reads a file this method still holds open. A child whose
     * id would resolve to a location outside the parent container (absolute
     * path or "../" segments) is refused, because child ids can come from
     * archive entries or attachment names.
     *
     * @param parentDoc
     *            parent conversion
     * @throws IOException
     *             on err preparing the parent container
     */
    public void convertChildren(ConvertedDocument parentDoc) throws IOException {

        if (parentDoc.is_webArchive) {
            // Web Archive is a single document.  Only intent here is to convert to a single text document.
            //
            return;
        }

        parentDoc.evalParentChildContainer();
        FileUtility.makeDirectory(parentDoc.parentContainer);
        String targetPath = parentDoc.parentContainer.getAbsolutePath();
        // Canonical form of the container, with trailing separator, for the containment check.
        String containerRoot = new File(targetPath).getCanonicalPath() + File.separator;

        for (Content child : parentDoc.getRawChildren()) {
            if (child.content == null) {
                log.error("Attempted to write out child object with no content {}", child.id);
                continue;
            }

            try {
                // We just assume for now Child ID is filename.
                // Alternatively, child.meta.getProperty(
                // ConvertedDocument.CHILD_ENTRY_KEY )
                // same result, just more verbose.
                //
                File childFile = new File(FilenameUtils.concat(targetPath, child.id));
                if (!childFile.getCanonicalPath().startsWith(containerRoot)) {
                    log.error("Refusing to write child {} outside of parent container {}", child.id, targetPath);
                    continue;
                }

                // Write and close first; conversion below re-opens the file.
                OutputStream io = new FileOutputStream(childFile);
                try {
                    IOUtils.write(child.content, io);
                } finally {
                    io.close();
                }

                ConvertedDocument childConv = convertFile(childFile, parentDoc);
                if (childConv != null) {
                    if (childConv.is_converted) {
                        // Push down all child metadata down to ConvertedDoc
                        for (String k : child.meta.stringPropertyNames()) {
                            String val = child.meta.getProperty(k);
                            childConv.addUserProperty(k, val);
                        }
                        // Save cached version once again.
                        childConv.saveBuffer(new File(childConv.textpath));
                    }

                    if (child.mimeType != null) {
                        try {
                            childConv.setMimeType(new MimeType(child.mimeType));
                        } catch (MimeTypeParseException e) {
                            log.warn("Invalid mime type encountered: {} ignoring.", child.mimeType);
                        }
                    }

                    parentDoc.addChild(childConv);
                }
            } catch (Exception err) {
                log.error("Failed to write out child {}, but will continue with others", child.id, err);
            }
        }
    }

    /**
     * Registers the default archive extensions and requested document
     * extensions. Invoked by the constructor; callers adjust the sets
     * afterwards (and before setup()) to change behavior.
     */
    public void defaults() {

        // Archive types.  "7z" is intentionally not included.
        final String[] archives = { "zip", "gz", "tar", "tgz", "tar.gz" };
        for (String type : archives) {
            archiveFileTypes.add(type);
        }

        // Core document types.  "txt" is only for encoding conversions.
        // TODO: get these from a config file.
        final String[] documents = { "doc", "docx", "pdf", "htm", "html", "txt", "msg", "eml", "emlx", "ppt",
                "pptx", "xlsx", "xls", "rtf" };
        for (String type : documents) {
            requestedFileTypes.add(type);
        }

        // Types still under test.
        final String[] experimental = { "dot", "dotx", "odt", "odf", "docm" };
        for (String type : experimental) {
            requestedFileTypes.add(type);
        }

        // Web archives.  "wps" (MS Works?) is left out: no real Tika support.
        requestedFileTypes.add("mht");

        // Only photographic images are supported by default.
        // BMP, GIF, PNG, ICO, etc. must be added by caller.
        final String[] photos = { "jpg", "jpeg" };
        for (String type : photos) {
            requestedFileTypes.add(type);
        }

        // Limited PST support here.  PST will not behave the same as other files.
        // It is closer to a Zip archive than an ordinary file.
        requestedFileTypes.add("pst");

        // "log" is uncommon and not added.  Caller must explicitly add
        // raw data types and archives.
    }

    /**
     * Start over: empties the requested file types of this instance and the
     * static converter registry. The ignored file types and archive file types
     * are left untouched. Because the converter registry is static, clearing
     * it affects every XText instance.
     */
    public void clearSettings() {
        this.requestedFileTypes.clear();
        XText.converters.clear();
    }

    /**
     * Prepares this instance for conversion. Creates the default and embedded
     * content converters, configures the path manager, and registers a
     * type-specific converter for each file type that is still present in the
     * requested file types. Types the caller removed beforehand (e.g., PDF or
     * HTML) get no type-specific converter registered here.
     *
     * <p>
     * When "html" is requested, "htm" and "xhtml" are added to the requested
     * types and share the same HTML converter. Finally, a "&lt;type&gt;.txt"
     * ignore filter is added for every requested type and the file filter
     * array is rebuilt from the requested types.
     *
     * @throws IOException
     *             on err
     */
    public void setup() throws IOException {

        defaultConversion = new DefaultConverter(maxBuffer);
        embeddedConversion = new EmbeddedContentConverter(maxBuffer);

        paths.configure();

        // Register type-specific converters only for types that are still
        // requested; if the caller has removed a file type from the list,
        // no converter is registered for it below.

        if (requestedFileTypes.contains("txt")) {
            converters.put("txt", new TextTranscodingConverter());
        }

        if (requestedFileTypes.contains("html")) {
            // One converter instance serves every HTML-like extension.
            Converter htmlConverter = new TikaHTMLConverter(this.scrubHTML, maxHTMLBuffer);
            for (String ext : new String[] { "html", "htm", "xhtml" }) {
                converters.put(ext, htmlConverter);
            }

            requestedFileTypes.add("htm");
            requestedFileTypes.add("xhtml");
        }

        // A single message converter is shared by both e-mail extensions.
        MessageConverter mailConverter = new MessageConverter();
        for (String ext : new String[] { "eml", "msg" }) {
            if (requestedFileTypes.contains(ext)) {
                converters.put(ext, mailConverter);
            }
        }

        // Web archives: "mht" /* RFC822 */
        WebArchiveConverter archiveConverter = new WebArchiveConverter();
        if (requestedFileTypes.contains("mht")) {
            converters.put("mht", archiveConverter);
        }

        ImageMetadataConverter imageConverter = new ImageMetadataConverter();
        for (String ext : new String[] { "jpeg", "jpg" }) {
            if (requestedFileTypes.contains(ext)) {
                converters.put(ext, imageConverter);
            }
        }

        // ALWAYS ignore our own text conversions or those of others:
        // every convertible type gets a filter for its conversion, e.g.,
        // pdf => ignore pdf.txt
        // doc => ignore doc.txt
        for (String type : requestedFileTypes) {
            ignoreFileType(type + ".txt");
        }

        fileFilters = requestedFileTypes.toArray(new String[0]);
    }

    /**
     * File type filters as an array. Initially null; setup() assigns it from
     * the contents of the requested file types.
     */
    private String[] fileFilters = null;

    /**
     * Returns the set of requested file types. setup() may add entries to it
     * (e.g., "htm" and "xhtml" when "html" is requested), so call this after
     * setup() has run to see all supported/requested types. The set returned
     * is the instance's own set, not a copy.
     *
     * @return file types as a set
     */
    public Set<String> getFileTypes() {
        return this.requestedFileTypes;
    }

    /**
     * Prints the command-line help for the XText main program to standard
     * output. Only long options are listed, because main() declares no short
     * options. The descriptions follow what main() does with each option:
     * --embed-conversion saves conversions with the input, while
     * --embed-children saves extracted child binaries with the input.
     */
    public static void usage() {
        System.out.println();
        System.out.println("==========XText Usage=============");
        System.out.println("XText --input input  [--help] " + "\n\t[--embed-conversion | --output folder ]   "
                + "\n\t[--embed-children   | --export folder] "
                + "\n\t[--clean-html]   [--strip-prefix path]   [--tika-pst]");
        System.out.println(" --help  print this message");
        System.out.println(" --input  where <input> is file or folder");
        System.out.println(" --output  where <folder> is the folder where you want to archive converted docs");
        System.out.println(" --embed-conversion embeds the saved conversions in the input folder under 'xtext/'");
        System.out.println("     NOTE: --embed-conversion has same effect as setting output to input;");
        System.out.println("     the --output folder is ignored.");
        System.out.println(" --embed-children embeds the extracted children binaries in the input folder");
        System.out.println("     (NOT the conversions, the binaries from Archives, PST, etc)");
        System.out.println("     Default behavior is to extract originals to output archive.");
        System.out.println(" --export folder\tOpposite of --embed-children. Extract children and save to <folder>");
        System.out.println(" --strip-prefix path\tstrip <path> from input paths; output is saved under the");
        System.out.println("     resulting relative path");
        System.out.println(" --clean-html enables HTML scrubbing");
        System.out.println(" --tika-pst enables Tika for PST files");
        System.out.println("========================");
    }

    /**
     * Conversion listener used by the command-line variation; it exists
     * purely to log the outcome of each conversion.
     *
     * @author ubaldino
     */
    static class MainProgramListener implements ConversionListener {

        private final Logger log = LoggerFactory.getLogger(getClass());

        /**
         * Logs the file path, whether a document object was produced at all,
         * and whether that document reports itself as converted.
         *
         * @param doc
         *            the conversion result; may be null
         * @param path
         *            path of the file that was processed
         */
        @Override
        public void handleConversion(ConvertedDocument doc, String path) {
            final boolean hasDocument = doc != null;
            final boolean converted = hasDocument && doc.is_converted;
            log.info("Converted. FILE={} Status={}, Converted={}", path, hasDocument, converted);
        }
    }

    /**
     * Command-line entry point. Parses the long options (--input, --output,
     * --export, --strip-prefix, --help, --clean-html, --embed-conversion,
     * --embed-children, --tika-pst), configures an XText instance accordingly,
     * then runs setup() and extractText() on the given input.
     *
     * <p>
     * Exits with status 1 after printing usage for --help, an unrecognized
     * option, or an option-parsing failure, and with status -1 when no input
     * was given. An IOException or ConfigException raised while configuring
     * or extracting is reported with the usage text and a stack trace.
     *
     * @param args
     *            command-line arguments
     */
    public static void main(String[] args) {

        LongOpt[] options = { new LongOpt("input", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
                new LongOpt("output", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
                new LongOpt("export", LongOpt.REQUIRED_ARGUMENT, null, 'x'),
                new LongOpt("strip-prefix", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
                new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
                new LongOpt("clean-html", LongOpt.NO_ARGUMENT, null, 'H'),
                new LongOpt("embed-conversion", LongOpt.NO_ARGUMENT, null, 'e'),
                new LongOpt("embed-children", LongOpt.NO_ARGUMENT, null, 'c'),
                new LongOpt("tika-pst", LongOpt.NO_ARGUMENT, null, 'T') };

        // No short options are declared (empty option string), so arguments
        // must use the long names above. Former short form: "hcex:i:o:p:"
        gnu.getopt.Getopt opts = new gnu.getopt.Getopt("XText", args, "", options);

        String input = null;
        String output = null;
        boolean embed = false;
        boolean filter_html = false;
        boolean saveChildrenWithInput = false;
        String saveChildrenTo = null;
        String prefix = null;

        XText xt = new XText();

        try {
            int c;
            while ((c = opts.getopt()) != -1) {
                switch (c) {

                case 0:
                    // Long opt processed.

                    break;

                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'H':
                    filter_html = true;
                    break;
                case 'c':
                    saveChildrenWithInput = true;
                    break;
                case 'x':
                    saveChildrenTo = opts.getOptarg();
                    break;
                case 'p':
                    prefix = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out.println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                case 'T':
                    xt.enableTikaPST(true);
                    break;
                case 'h':
                default:
                    XText.usage();
                    System.exit(1);
                }
            }
        } catch (Exception err) {
            // Report why parsing failed instead of silently showing usage.
            System.err.println("Failed to parse arguments: " + err.getMessage());
            XText.usage();
            System.exit(1);
        }

        if (input == null) {
            System.out.println("An input argument is required, e.g., --input /Folder/...");
            System.exit(-1);
        }

        // Setting LANG=en_US in your shell.
        //
        // System.setProperty("LANG", "en_US");

        xt.enableOverwrite(true); // Given this is a test application, we will
        // overwrite every time XText is called.
        xt.enableSaving(embed || output != null);
        // Save conversions alongside the input when --embed-conversion is set.
        xt.getPathManager().enableSaveWithInput(embed);
        xt.enableHTMLScrubber(filter_html);
        xt.getPathManager().enableSaveChildrenWithInput(saveChildrenWithInput);

        // If user wishes to strip input paths of some prefix
        // Output will be dumped in the resulting relative path.
        xt.getPathManager().setStripPrefixPath(prefix);

        // Manage the extraction of compound files -- archives, PST mailbox file, etc.
        // The export folder only applies when children are not saved with the input.
        if (!saveChildrenWithInput && saveChildrenTo != null) {
            xt.getPathManager().setExtractedChildrenCache(saveChildrenTo);
        }

        try {
            if (!embed) {
                // This main program requires an output path; fall back to
                // a default folder when none was given.
                if (output == null) {
                    output = "output";
                    FileUtility.makeDirectory(output);
                    System.out.println("Default output folder is $PWD/" + output);
                }
                xt.enableSaving(true); // Will save to output dir.
                xt.getPathManager().setConversionCache(output);
            }
            // Set itself to listen, as this is the main program.
            xt.setConversionListener(new MainProgramListener());

            xt.setup();
            xt.extractText(input);
        } catch (IOException ioerr) {
            XText.usage();
            ioerr.printStackTrace();
        } catch (ConfigException cfgerr) {
            XText.usage();
            cfgerr.printStackTrace();
        }
    }
}