// Java tutorial
/** * Copyright 2009-2013 The MITRE Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * * ************************************************************************** * NOTICE This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** * */ package org.opensextant.examples; import java.io.File; import java.io.IOException; import java.util.List; import org.apache.commons.io.FilenameUtils; import org.opensextant.ConfigException; import org.opensextant.extraction.ExtractionMetrics; import org.opensextant.extractors.geo.PlaceGeocoder; import org.opensextant.extractors.xtemporal.XTemporal; import org.opensextant.output.AbstractFormatter; import org.opensextant.output.FormatterFactory; import org.opensextant.output.ResultsFormatter; import org.opensextant.processing.Parameters; import org.opensextant.processing.ProcessingException; import org.opensextant.processing.XtractorGroup; import org.opensextant.util.FileUtility; import org.opensextant.util.TextUtils; import org.opensextant.xtext.ConversionListener; import org.opensextant.xtext.ConvertedDocument; import org.opensextant.xtext.XText; import org.slf4j.LoggerFactory; /** * <pre> * A default illustration of using 
Xponent xtractors for geo and temporal * extraction. This demo shows how to: * * setup some extractors * crawl data * process data * output in particular formats. * * All showing the most basic aspects of the OpenSextant and Xponents APIs * * NOTE: this is a variation on OpenSextant v1.4 "Runner" app. * *</pre> * * @author ubaldino */ public class BasicGeoTemporalProcessing extends XtractorGroup implements ConversionListener { private Parameters params = new Parameters(); protected XText converter; /* # of documents */ private int total_docs = 0; private long total_rawbytes = 0; private long total_size = 0; /* Process 4 MB of text content 800 x 5KB average documents */ private ExtractionMetrics conversionMetric = new ExtractionMetrics("doc-conversion"); private ExtractionMetrics processingMetric = new ExtractionMetrics("doc-processing"); private boolean overwriteOutput = true; /** * */ public BasicGeoTemporalProcessing() { log = LoggerFactory.getLogger(BasicGeoTemporalProcessing.class); } /** * Shutdown: release global resources, if any; Close all formatters * */ public void shutdown() { //PlacenameMatcher.shutdown(); cleanupAll(); for (ResultsFormatter outputter : formatters) { outputter.finish(); } } /** Ideally you should separate your one-time initialization steps, configuring your extractors * apart from the repetitive steps of setting up Jobs and Inputs. Outputs you might setup once * for the entire JVM session, or it may be something you do periodically. 
In summary: * * configure separately: * a) extractors, converters * b) job inputs and parameters * c) output formatters * d) other resources, e.g., filters */ public void setup(String inFile, List<String> outFormats, String outFile, String tempDir) throws ConfigException, ProcessingException, IOException { params.isdefault = false; if (!validateParameters(inFile, outFormats, outFile, tempDir, params)) { throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString()); } // If you are dead-sure you want only coordinates from text, then just use XCoord. // Otherwise SimpleGeocoder does both coords + names. // //XCoord xcoord = new XCoord(); //xcoord.configure(); //this.addExtractor(xcoord); // Testing only params.tag_places = true; params.tag_coordinates = true; params.output_countries = false; PlaceGeocoder geocoder = new PlaceGeocoder(); geocoder.enablePersonNameMatching(true); geocoder.setParameters(params); geocoder.configure(); this.addExtractor(geocoder); XTemporal xtemp = new XTemporal(); xtemp.configure(); this.addExtractor(xtemp); converter = new XText(); converter.enableHTMLScrubber(false); converter.enableSaving(true); converter.enableOverwrite(false); converter.setConversionListener(this); // Complications: Where do we save converted items? // Developer should change this based on actual environment, paths, perms, etc. // Using a "temp" folder as XText cache or no cache at all... // This is for illustration purposes only. 
// if (tempDir != null) { converter.getPathManager().setConversionCache(tempDir); } else { converter.enableSaving(false); } try { converter.setup(); } catch (IOException ioerr) { throw new ConfigException("Document converter could not start", ioerr); } this.params.inputFile = inFile.trim(); this.params.outputFile = outFile.trim(); if (outFormats != null) { for (String fmt : outFormats) { params.addOutputFormat(fmt); AbstractFormatter formatter = createFormatter(fmt, params); formatter.overwrite = overwriteOutput; this.addFormatter(formatter); //if (formatter instanceof CSVFormatter) { // formatter.addField(OpenSextantSchema.FILEPATH.getName()); // formatter.addField(OpenSextantSchema.MATCH_TEXT.getName()); // } formatter.start(params.getJobName()); } } } /** * The default formatter */ public static AbstractFormatter createFormatter(String outputFormat, Parameters plist) throws IOException, ProcessingException { if (plist.isdefault) { throw new ProcessingException("Caller is required to use non-default Parameters; " + "\nat least set the output options, folder, jobname, etc."); } AbstractFormatter formatter = (AbstractFormatter) FormatterFactory.getInstance(outputFormat); if (formatter == null) { throw new ProcessingException("Wrong formatter?"); } formatter.setParameters(plist); formatter.setOutputFilename(plist.getJobName() + formatter.outputExtension); return formatter; } /** * =============================================== Pipeline mechanics: track * # of docs, raw bytes, plain/text chars. * =============================================== */ /** * Statusing metrics: # of documents processed so far. */ public int getCurrentDocCount() { return total_docs; } /** * Statusing metrics: # of raw bytes processed so far. */ public long getCurrentByteCount() { return total_rawbytes; } /** * Statusing metrics: # of plain text characters processed so far. */ public long getCurrentTextCharCount() { return total_size; } /** * Runs OpenSextant. 
See the * <code>main</code> method for a description of the input parameters. TODO: * outFile is not used. It is only used as a part of global settings * somewhere.... * @throws ConfigException * */ public void run() throws ProcessingException, IOException, ConfigException { printRequest(); log.info("Starting document ingest"); startTime = System.currentTimeMillis(); prevTime = startTime; // All input and processing happens within: converter.extractText(this.params.inputFile); reportMemory(); log.info("Finished all processing"); } long startTime = 0; long prevTime = 0; /** * Note -- a corpus will explode in memory if the job is too large. * Processor design should account for how to partition the problem - * ingest, conversion, geocoding, persistence, output format generation. * * This implements the XText conversion listener -- when a document is found * it is reported here. We add it to the corpus prior to executing the * application on the corpus. * * The preferred mode is to take the list of document URLs and process them * as a batch. 
* */ public void handleConversion(ConvertedDocument txtdoc, String fpath) { if (txtdoc == null) { log.error("NOTE: Document could not be converted FILE={}", fpath); return; } total_rawbytes += txtdoc.filesize; ++total_docs; total_size += txtdoc.buffer.length(); long now = System.currentTimeMillis(); conversionMetric.addTime(now - prevTime); prevTime = now; this.processAndFormat(txtdoc); now = System.currentTimeMillis(); processingMetric.addTime(now - prevTime); prevTime = now; if (total_docs % 100 == 0) { reportMemory(); } } public void reportMemory() { Runtime R = Runtime.getRuntime(); long usedMemory = R.totalMemory() - R.freeMemory(); log.info("CURRENT MEM USAGE(K)=" + (int) (usedMemory / 1024)); } public void reportMetrics() { log.info("===============\nDOCUMENT CONVERSION"); log.info("\t" + conversionMetric.toString()); log.info("===============\nDOCUMENT PROCESSING"); log.info("\t" + processingMetric.toString()); } private static String _inFile = null; private static String _outFile = null; private static String _outFormat = null; private static List<String> _outFormats = null; private static String _tempDir = null; /** * Parse command line options. 
*/ private static void parseCommandLine(String[] args) { gnu.getopt.Getopt opts = new gnu.getopt.Getopt("BasicGeoTemp", args, "hi:f:o:t:"); int c; while ((c = opts.getopt()) != -1) { switch (c) { // -i inputFile = path to file or directory of files to be processed case 'i': _inFile = opts.getOptarg(); break; // -f outputFormat = the desired output format case 'f': _outFormat = opts.getOptarg(); _outFormats = TextUtils.string2list(_outFormat.trim(), ","); break; // -o outputDir = the path to output file case 'o': _outFile = opts.getOptarg(); break; // -t tempDir = the path to temp directory case 't': _tempDir = opts.getOptarg(); break; case 'h': default: printHelp(); System.exit(-1); } } } protected void printRequest() { log.info("----------------- REQUEST -----------------"); log.info("Input file: " + params.inputFile); log.info("Output format: " + params.getOutputFormats()); log.info("Output location: " + params.outputDir); } /** * Print a usage message */ protected static void printHelp() { System.out.println("Options:"); System.out.println("\t-i inputFile = path to file or directory of files to be processed"); System.out.println("\t-f outputFormat = the desired output format"); System.out.println("\t-o outputFile = the path to output file"); System.out.println("\t-t tempDir = the path to the temporary storage directory"); } private StringBuilder runnerMessage = new StringBuilder(); /** * Check that the input parameters are valid and complete. * * @return true if parameters and defaults suffice; false otherwise. 
*/ public boolean validateParameters(String inPath, List<String> outFormats, String outPath, String tempDir, Parameters plist) { runnerMessage = new StringBuilder(); if (outPath == null) { runnerMessage.append("Please specify an Output file or folder"); return false; } inPath = inPath.trim(); outPath = outPath.trim(); // Make sure input file exists File inFile = new File(inPath); if (!inFile.exists()) { runnerMessage.append("Input file " + inPath + " does not exist"); return false; } // Check output format if (outFormats != null) { for (String outFormat : outFormats) { if (!FormatterFactory.isSupported(outFormat)) { runnerMessage.append("Unrecognized output format: " + outFormat); return false; } } } if (inPath.startsWith("$") || outPath.startsWith("$")) { runnerMessage.append("Invalid input/output -- Ant style arguments are null"); return false; } // Verify user has specified a directory for unpacking an archive // Get file extension //String ext = FilenameUtils.getExtension(inPath); if (FileUtility.isArchiveFile(inPath) && tempDir == null) { runnerMessage.append( "A directory for temporary storage must be provided for unpacking Zip and other archive files"); return false; } // Split the path name into directory and file names File container = new File(outPath); File destDir = null; String destFile = null; log.info("Working off INPUT=" + container.getAbsolutePath()); if (container.isDirectory()) { destDir = container; try { // DEFAULT file name. 
plist.setJobName("OpenSextant_Output_" + Parameters.getJobTimestamp()); } catch (Exception fmterr) { runnerMessage.append("Failed to invoke the requested format to create a default output file"); return false; } } else { destDir = container.getParentFile(); if (destDir == null) { destDir = new File("."); log.info("Saving output to current working directory"); } destFile = container.getName(); plist.setJobName(FilenameUtils.getBaseName(destFile)); } if (!destDir.exists()) { // throw new IOException("Sorry - your destination folder " + destDir + " must exist"); runnerMessage.append("Destination folder must exist, DIR=" + destDir.getAbsolutePath()); return false; } plist.outputDir = destDir.getAbsolutePath(); return true; } /** * Runs Xponent Example from the command line. Command line options are: * <ul> * <li> * <code>-i </code><i>inputFile</i> Path to file or directory of files to be * processed * </li><li> * <code>-f </code><i>outputFormat</i> The desired output format * </li><li> * <code>-o </code><i>outputDir</i> Path to output file * </li><li> * <code>-t </code><i>tempDir</i> Path to the temporary storage directory, * if one is required * </li><li> * <code>-d </code><i>descriptionType</i> Choice of text string used to fill * description fields, if the output format has a description field. * </li> * </ul><p> */ public static void main(String[] args) { System.out.println("Parsing Commandline"); parseCommandLine(args); try { BasicGeoTemporalProcessing runner = new BasicGeoTemporalProcessing(); runner.setup(_inFile, _outFormats, _outFile, _tempDir); runner.run(); runner.shutdown(); // Success. } catch (Exception err) { err.printStackTrace(); } System.exit(0); } }