org.opensextant.examples.BasicGeoTemporalProcessing.java Source code

Introduction

Here is the source code for org.opensextant.examples.BasicGeoTemporalProcessing.java
Source

/**
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.opensextant.examples;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.opensextant.ConfigException;
import org.opensextant.extraction.ExtractionMetrics;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.xtemporal.XTemporal;
import org.opensextant.output.AbstractFormatter;
import org.opensextant.output.FormatterFactory;
import org.opensextant.output.ResultsFormatter;
import org.opensextant.processing.Parameters;
import org.opensextant.processing.ProcessingException;
import org.opensextant.processing.XtractorGroup;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConversionListener;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.XText;
import org.slf4j.LoggerFactory;

/**
 * <pre>
 * A default illustration of using Xponent xtractors for geo and temporal
 * extraction.  This demo shows how to:
 *
 *  setup some extractors
 *  crawl data
 *  process data
 *  output in particular formats.
 *
 * All showing the most basic aspects of the OpenSextant and Xponents APIs
 *
 * NOTE: this is a variation on OpenSextant v1.4 "Runner" app.
 *
 *</pre>
 *
 * @author ubaldino
 */
public class BasicGeoTemporalProcessing extends XtractorGroup implements ConversionListener {

    /* Job parameters shared with the geocoder and the output formatters (see setup()). */
    private Parameters params = new Parameters();
    /* Document-to-text converter; instantiated and configured in setup(). */
    protected XText converter;
    /* # of documents */
    private int total_docs = 0;
    /* Running sum of ConvertedDocument.filesize over all converted documents. */
    private long total_rawbytes = 0;
    /* Running sum of the converted text buffer lengths. */
    private long total_size = 0;
    /* Process 4 MB of text content  800 x 5KB average documents */
    /* NOTE(review): the remark above does not describe any field visible here;
     * presumably left over from a removed batch-size setting -- confirm. */
    /* Time accumulated between conversion callbacks, recorded in handleConversion(). */
    private ExtractionMetrics conversionMetric = new ExtractionMetrics("doc-conversion");
    /* Time spent in processAndFormat() per document, recorded in handleConversion(). */
    private ExtractionMetrics processingMetric = new ExtractionMetrics("doc-processing");
    /* Copied onto each formatter's "overwrite" flag in setup(). */
    private boolean overwriteOutput = true;

    /**
     * Creates an unconfigured processor and points the inherited logger at
     * this class. Call {@link #setup(String, List, String, String)} before
     * {@link #run()}.
     */
    public BasicGeoTemporalProcessing() {
        super();
        log = LoggerFactory.getLogger(BasicGeoTemporalProcessing.class);
    }

    /**
     * Ends the session: first asks the extractor group to clean up
     * (cleanupAll()), then tells every registered output formatter to finish.
     */
    public void shutdown() {
        // Release whatever the extractors are holding on to.
        cleanupAll();

        // Let each formatter finalize its output.
        for (ResultsFormatter activeFormatter : formatters) {
            activeFormatter.finish();
        }
    }

    /**
     * Configures everything this demo needs in a single call: validates the
     * arguments, sets up the geo and temporal extractors, prepares the XText
     * converter, and creates and starts one formatter per requested output format.
     *
     * In a real application these concerns are better configured separately:
     *   a) extractors, converters
     *   b) job inputs and parameters
     *   c) output formatters
     *   d) other resources, e.g., filters
     * Extractor configuration is typically a one-time step, whereas jobs and
     * inputs are set up repeatedly; outputs may be per-session or periodic.
     *
     * @param inFile     path of the file or folder to process
     * @param outFormats names of the output formats to produce; may be null,
     *                   in which case no formatter is created
     * @param outFile    output file or folder
     * @param tempDir    folder used as the XText conversion cache; when null,
     *                   saving of converted documents is switched off
     * @throws ConfigException     if the document converter fails to start
     * @throws ProcessingException if validateParameters() rejects the arguments
     * @throws IOException         declared for the extractor/formatter calls made here
     */
    public void setup(String inFile, List<String> outFormats, String outFile, String tempDir)
            throws ConfigException, ProcessingException, IOException {

        params.isdefault = false;

        boolean argumentsOk = validateParameters(inFile, outFormats, outFile, tempDir, params);
        if (!argumentsOk) {
            throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
        }

        // Tagging switches for this demo (the original author marked these "Testing only").
        params.tag_places = true;
        params.tag_coordinates = true;
        params.output_countries = false;

        // Place-name geocoder, with both place and coordinate tagging switched on above.
        // Per the original author's note: if only coordinates are wanted from
        // text, an XCoord extractor could be configured and added here instead.
        PlaceGeocoder placeTagger = new PlaceGeocoder();
        placeTagger.enablePersonNameMatching(true);
        placeTagger.setParameters(params);
        placeTagger.configure();
        addExtractor(placeTagger);

        // Temporal extractor.
        XTemporal dateTagger = new XTemporal();
        dateTagger.configure();
        addExtractor(dateTagger);

        // Converter: this object is the listener, so each converted document
        // arrives at handleConversion().
        converter = new XText();
        converter.enableHTMLScrubber(false);
        converter.enableSaving(true);
        converter.enableOverwrite(false);
        converter.setConversionListener(this);

        // Where converted items are saved depends on the actual environment
        // (paths, permissions, ...). For illustration only: use the temp
        // folder as the XText cache, or do not save at all.
        if (tempDir == null) {
            converter.enableSaving(false);
        } else {
            converter.getPathManager().setConversionCache(tempDir);
        }

        try {
            converter.setup();
        } catch (IOException ioerr) {
            throw new ConfigException("Document converter could not start", ioerr);
        }

        params.inputFile = inFile.trim();
        params.outputFile = outFile.trim();

        if (outFormats == null) {
            return;
        }

        // One formatter per requested format; each is registered and then started.
        for (String formatName : outFormats) {
            params.addOutputFormat(formatName);

            AbstractFormatter writer = createFormatter(formatName, params);
            writer.overwrite = overwriteOutput;
            addFormatter(writer);

            writer.start(params.getJobName());
        }
    }

    /**
     * Builds a formatter for the named output format and binds it to the
     * given job parameters. The output file name is the job name followed by
     * the formatter's own extension.
     *
     * @param outputFormat name of the format, as understood by FormatterFactory
     * @param plist        job parameters; must not be flagged as default
     * @return the configured formatter
     * @throws IOException         declared for the factory/formatter calls made here
     * @throws ProcessingException if plist is still default, or the factory
     *                             yields no formatter for the requested format
     */
    public static AbstractFormatter createFormatter(String outputFormat, Parameters plist)
            throws IOException, ProcessingException {

        if (plist.isdefault) {
            throw new ProcessingException("Caller is required to use non-default Parameters; "
                    + "\nat least set the output options, folder, jobname, etc.");
        }

        Object created = FormatterFactory.getInstance(outputFormat);
        if (created == null) {
            throw new ProcessingException("Wrong formatter?");
        }

        AbstractFormatter result = (AbstractFormatter) created;
        result.setParameters(plist);

        String outputName = plist.getJobName() + result.outputExtension;
        result.setOutputFilename(outputName);

        return result;
    }

    /*
     * ===============================================
     * Pipeline mechanics: track # of docs, raw bytes, plain/text chars.
     * ===============================================
     */
    /**
     * Status metric.
     *
     * @return the number of documents converted and handled so far
     */
    public int getCurrentDocCount() {
        return this.total_docs;
    }

    /**
     * Status metric.
     *
     * @return the total raw size, in bytes, of the documents handled so far
     */
    public long getCurrentByteCount() {
        return this.total_rawbytes;
    }

    /**
     * Status metric.
     *
     * @return the total number of plain-text characters handled so far
     */
    public long getCurrentTextCharCount() {
        return this.total_size;
    }

    /**
     * Runs the job configured by setup(): logs the request, resets the timing
     * checkpoints, and hands the input path to the converter. Conversion,
     * extraction and formatting all happen inside that converter call, via
     * the handleConversion() callback.
     *
     * See <code>main</code> for the command line parameters.
     * TODO (from the original author): outFile is not used here; it is only
     * used as part of the global settings.
     *
     * @throws ProcessingException declared for processing failures
     * @throws IOException         declared for I/O failures during ingest
     * @throws ConfigException     declared for configuration failures
     */
    public void run() throws ProcessingException, IOException, ConfigException {

        printRequest();

        log.info("Starting document ingest");
        final long began = System.currentTimeMillis();
        startTime = began;
        prevTime = began;

        // Input crawling and all per-document processing are driven from here.
        converter.extractText(params.inputFile);

        reportMemory();
        log.info("Finished all processing");
    }

    /* Wall-clock time in milliseconds captured when run() begins. */
    long startTime = 0;
    /* Most recent timing checkpoint in milliseconds; set in run() and advanced in handleConversion(). */
    long prevTime = 0;

    /**
     * XText conversion callback: invoked with each document the converter
     * reports. The document is counted, timed, and immediately run through
     * processAndFormat().
     *
     * The time since the previous checkpoint is attributed to conversion; the
     * time spent in processAndFormat() is attributed to processing. Memory
     * usage is logged every 100 documents.
     *
     * Author's caution: a corpus will explode in memory if the job is too
     * large, so a processor design should consider how to partition ingest,
     * conversion, geocoding, persistence and output generation.
     *
     * @param txtdoc the converted document, or null when conversion failed
     * @param fpath  path of the source file; used in the error log
     */
    public void handleConversion(ConvertedDocument txtdoc, String fpath) {
        if (txtdoc == null) {
            log.error("NOTE: Document could not be converted FILE={}", fpath);
            return;
        }

        // Bookkeeping for the status getters.
        total_rawbytes += txtdoc.filesize;
        total_docs += 1;
        total_size += txtdoc.buffer.length();

        // Everything since the last checkpoint counts as conversion time.
        final long conversionDone = System.currentTimeMillis();
        conversionMetric.addTime(conversionDone - prevTime);
        prevTime = conversionDone;

        processAndFormat(txtdoc);

        // ... and the interval just elapsed counts as processing time.
        final long processingDone = System.currentTimeMillis();
        processingMetric.addTime(processingDone - prevTime);
        prevTime = processingDone;

        boolean atReportingInterval = (total_docs % 100 == 0);
        if (atReportingInterval) {
            reportMemory();
        }
    }

    /**
     * Logs the JVM's currently used heap (total minus free), in kilobytes.
     *
     * The value is kept as a long: narrowing it to int would overflow once
     * the used heap exceeds 2 TiB.
     */
    public void reportMemory() {
        Runtime runtime = Runtime.getRuntime();
        long usedKilobytes = (runtime.totalMemory() - runtime.freeMemory()) / 1024;
        log.info("CURRENT MEM USAGE(K)={}", usedKilobytes);
    }

    /**
     * Logs the accumulated conversion and processing metrics, each under its
     * own banner line.
     */
    public void reportMetrics() {
        log.info("===============\nDOCUMENT CONVERSION");
        String conversionSummary = "\t" + conversionMetric.toString();
        log.info(conversionSummary);

        log.info("===============\nDOCUMENT PROCESSING");
        String processingSummary = "\t" + processingMetric.toString();
        log.info(processingSummary);
    }

    /* Command line state, filled in by parseCommandLine() and consumed by main(). */
    /* -i : input file or folder. */
    private static String _inFile = null;
    /* -o : output file or folder. */
    private static String _outFile = null;
    /* -f : raw output format argument, as typed. */
    private static String _outFormat = null;
    /* -f : the same argument split on "," into individual format names. */
    private static List<String> _outFormats = null;
    /* -t : temporary storage folder. */
    private static String _tempDir = null;

    /**
     * Reads the command line options into the static option fields.
     *
     * Recognized flags: -i input, -f comma-separated output formats,
     * -o output, -t temp folder. -h or any unrecognized flag prints the usage
     * text and exits the JVM with status -1.
     *
     * @param args the raw command line arguments
     */
    private static void parseCommandLine(String[] args) {
        gnu.getopt.Getopt parser = new gnu.getopt.Getopt("BasicGeoTemp", args, "hi:f:o:t:");

        for (int flag = parser.getopt(); flag != -1; flag = parser.getopt()) {
            if (flag == 'i') {
                // -i inputFile = path to file or directory of files to be processed
                _inFile = parser.getOptarg();
            } else if (flag == 'f') {
                // -f outputFormat = the desired output format(s), comma-separated
                _outFormat = parser.getOptarg();
                _outFormats = TextUtils.string2list(_outFormat.trim(), ",");
            } else if (flag == 'o') {
                // -o outputDir = the path to output file
                _outFile = parser.getOptarg();
            } else if (flag == 't') {
                // -t tempDir = the path to temp directory
                _tempDir = parser.getOptarg();
            } else {
                // -h, or anything not recognized above
                printHelp();
                System.exit(-1);
            }
        }
    }

    /**
     * Logs a short summary of the current job: input path, requested output
     * formats and output folder, all taken from the job parameters.
     */
    protected void printRequest() {
        final String inputLine = "Input file: " + params.inputFile;
        final String formatLine = "Output format: " + params.getOutputFormats();
        final String locationLine = "Output location: " + params.outputDir;

        log.info("----------------- REQUEST -----------------");
        log.info(inputLine);
        log.info(formatLine);
        log.info(locationLine);
    }

    /**
     * Writes the command line usage text to standard output, one option per line.
     */
    protected static void printHelp() {
        final String[] usageLines = {
            "Options:",
            "\t-i inputFile = path to file or directory of files to be processed",
            "\t-f outputFormat = the desired output format",
            "\t-o outputFile = the path to output file",
            "\t-t tempDir = the path to the temporary storage directory"
        };

        for (String usageLine : usageLines) {
            System.out.println(usageLine);
        }
    }

    /* Explanation of the most recent validation failure; reset by validateParameters() and reported by setup(). */
    private StringBuilder runnerMessage = new StringBuilder();

    /**
     * Checks that the input parameters are valid and complete, and derives the
     * job name and output folder from the output path.
     *
     * On failure the reason is left in the runner message buffer, which
     * setup() includes in the exception it throws. On success, the job name
     * and <code>outputDir</code> of <code>plist</code> have been set:
     * when the output path is an existing folder a timestamped default job
     * name is used; otherwise the base name of the output file is the job name
     * and its parent folder (or the current folder) is the output folder.
     *
     * @param inPath     input file or folder; must be given and must exist
     * @param outFormats requested output format names; may be null
     * @param outPath    output file or folder; must be given
     * @param tempDir    temporary folder; required when the input is an archive
     * @param plist      job parameters to receive the job name and output folder
     * @return true if parameters and defaults suffice; false otherwise.
     */
    public boolean validateParameters(String inPath, List<String> outFormats, String outPath, String tempDir,
            Parameters plist) {

        runnerMessage = new StringBuilder();

        if (outPath == null) {
            runnerMessage.append("Please specify an Output file or folder");
            return false;
        }

        // A missing input (e.g. no -i on the command line) is a validation
        // error, not a NullPointerException.
        if (inPath == null) {
            runnerMessage.append("Please specify an Input file or folder");
            return false;
        }

        inPath = inPath.trim();
        outPath = outPath.trim();

        // Unexpanded Ant-style placeholders such as ${input}. Checked before
        // the existence test so the caller gets this more specific message.
        if (inPath.startsWith("$") || outPath.startsWith("$")) {
            runnerMessage.append("Invalid input/output -- Ant style arguments are null");
            return false;
        }

        // Make sure input file exists
        File inFile = new File(inPath);
        if (!inFile.exists()) {
            runnerMessage.append("Input file " + inPath + " does not exist");
            return false;
        }

        // Check output format
        if (outFormats != null) {
            for (String outFormat : outFormats) {
                if (!FormatterFactory.isSupported(outFormat)) {
                    runnerMessage.append("Unrecognized output format: " + outFormat);
                    return false;
                }
            }
        }

        // Verify user has specified a directory for unpacking an archive
        if (FileUtility.isArchiveFile(inPath) && tempDir == null) {
            runnerMessage.append(
                    "A directory for temporary storage must be provided for unpacking Zip and other archive files");
            return false;
        }

        // Split the output path into directory and file names
        File container = new File(outPath);
        File destDir = null;
        log.info("Working off OUTPUT=" + container.getAbsolutePath());

        if (container.isDirectory()) {
            destDir = container;
            try {
                // DEFAULT file name.
                plist.setJobName("OpenSextant_Output_" + Parameters.getJobTimestamp());
            } catch (Exception fmterr) {
                runnerMessage.append("Failed to invoke the requested format to create a default output file");
                return false;
            }
        } else {
            destDir = container.getParentFile();
            if (destDir == null) {
                // Bare file name with no parent: write next to the working directory.
                destDir = new File(".");
                log.info("Saving output to current working directory");
            }
            String destFile = container.getName();
            plist.setJobName(FilenameUtils.getBaseName(destFile));
        }

        // isDirectory() is false for a missing path as well as for a plain
        // file, so this covers both "does not exist" and "is not a folder".
        if (!destDir.isDirectory()) {
            runnerMessage.append("Destination folder must exist, DIR=" + destDir.getAbsolutePath());
            return false;
        }

        plist.outputDir = destDir.getAbsolutePath();

        return true;
    }

    /**
     * Runs Xponent Example from the command line. Command line options are:
     * <ul>
     * <li>
     * <code>-i </code><i>inputFile</i> Path to file or directory of files to be
     * processed
     * </li><li>
     * <code>-f </code><i>outputFormat</i> The desired output format; several
     * formats may be given separated by commas
     * </li><li>
     * <code>-o </code><i>outputDir</i> Path to output file
     * </li><li>
     * <code>-t </code><i>tempDir</i> Path to the temporary storage directory,
     * if one is required
     * </li><li>
     * <code>-h </code> Print the usage message and exit
     * </li>
     * </ul>
     * <p>
     * The JVM exits with status 0 when setup, run and shutdown all complete,
     * and with a non-zero status when any of them throws.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {

        System.out.println("Parsing Commandline");
        parseCommandLine(args);

        int exitStatus = 0;
        try {
            BasicGeoTemporalProcessing runner = new BasicGeoTemporalProcessing();

            runner.setup(_inFile, _outFormats, _outFile, _tempDir);
            runner.run();
            runner.shutdown();
            // Success.
        } catch (Exception err) {
            err.printStackTrace();
            // Report the failure to the calling shell; same status value that
            // parseCommandLine() uses when it aborts.
            exitStatus = -1;
        }
        System.exit(exitStatus);
    }
}