eu.interedition.collatex.cli.Engine.java Source code

Java tutorial

Introduction

Here is the source code for eu.interedition.collatex.cli.Engine.java

Source

/*
 * Copyright (c) 2013 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.interedition.collatex.cli;

import com.google.common.base.Function;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.google.common.io.Closer;
import com.google.common.io.Files;
import eu.interedition.collatex.CollationAlgorithm;
import eu.interedition.collatex.CollationAlgorithmFactory;
import eu.interedition.collatex.Token;
import eu.interedition.collatex.VariantGraph;
import eu.interedition.collatex.http.JsonProcessor;
import eu.interedition.collatex.jung.JungVariantGraph;
import eu.interedition.collatex.matching.EqualityTokenComparator;
import eu.interedition.collatex.simple.SimpleCollation;
import eu.interedition.collatex.simple.SimplePatternTokenizer;
import eu.interedition.collatex.simple.SimpleToken;
import eu.interedition.collatex.simple.SimpleTokenNormalizers;
import eu.interedition.collatex.simple.SimpleVariantGraphSerializer;
import eu.interedition.collatex.simple.SimpleWitness;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.xml.sax.SAXException;

import javax.script.ScriptException;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Comparator;
import java.util.List;

/**
 * @author <a href="http://gregor.middell.net/" title="Homepage">Gregor Middell</a>
 */
public class Engine implements Closeable {

    Charset inputCharset;
    boolean xmlMode;
    List<URL> inputResources;
    List<SimpleWitness> witnesses;
    XPathExpression tokenXPath;

    Function<String, Iterable<String>> tokenizer;
    Function<String, String> normalizer;
    Comparator<Token> comparator;
    CollationAlgorithm collationAlgorithm;
    VariantGraph variantGraph;
    boolean joined = false;

    String outputFormat;
    Charset outputCharset;
    PrintWriter out;
    File outFile = null;
    PrintWriter log = new PrintWriter(System.err);
    boolean errorOccurred = false;

    Engine configure(CommandLine commandLine)
            throws XPathExpressionException, ParseException, ScriptException, IOException {
        this.inputCharset = Charset.forName(commandLine.getOptionValue("ie", "UTF-8"));
        this.xmlMode = commandLine.hasOption("xml");
        this.tokenXPath = XPathFactory.newInstance().newXPath()
                .compile(commandLine.getOptionValue("xp", "//text()"));

        final String script = commandLine.getOptionValue("s");
        try {
            final PluginScript pluginScript = (script == null
                    ? PluginScript.read("<internal>", new StringReader(""))
                    : PluginScript.read(argumentToResource(script)));

            this.tokenizer = Objects.firstNonNull(pluginScript.tokenizer(), SimplePatternTokenizer.BY_WS_OR_PUNCT);
            this.normalizer = Objects.firstNonNull(pluginScript.normalizer(), SimpleTokenNormalizers.LC_TRIM_WS);
            this.comparator = Objects.firstNonNull(pluginScript.comparator(), new EqualityTokenComparator());
        } catch (IOException e) {
            throw new ParseException("Failed to read script '" + script + "' - " + e.getMessage());
        }

        final String algorithm = commandLine.getOptionValue("a", "dekker").toLowerCase();
        switch (algorithm) {
        case "needleman-wunsch":
            this.collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(this.comparator);
            break;
        case "medite":
            this.collationAlgorithm = CollationAlgorithmFactory.medite(this.comparator,
                    SimpleToken.TOKEN_MATCH_EVALUATOR);
            break;
        case "gst":
            this.collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(comparator, 2);
            break;
        default:
            this.collationAlgorithm = CollationAlgorithmFactory.dekker(this.comparator);
            break;
        }

        this.variantGraph = new JungVariantGraph();

        this.joined = !commandLine.hasOption("t");

        this.outputFormat = commandLine.getOptionValue("f", "json").toLowerCase();

        outputCharset = Charset.forName(commandLine.getOptionValue("oe", "UTF-8"));
        final String output = commandLine.getOptionValue("o", "-");
        if (!"-".equals(output)) {
            try {
                this.outFile = new File(output);
                this.out = new PrintWriter(Files.newWriter(this.outFile, outputCharset));
            } catch (FileNotFoundException e) {
                throw new ParseException("Output file '" + outFile + "' not found");
            }
        } else {
            this.out = new PrintWriter(new OutputStreamWriter(System.out, outputCharset));
        }

        final String[] witnessSpecs = commandLine.getArgs();
        this.inputResources = Lists.newArrayListWithExpectedSize(witnessSpecs.length);
        for (String witnessSpec : witnessSpecs) {
            inputResources.add(argumentToResource(witnessSpec));
        }
        if (inputResources.size() < 1) {
            throw new ParseException("No input resource(s) given");
        }

        return this;
    }

    Engine read() throws IOException, XPathExpressionException, SAXException {
        if (inputResources.size() < 2) {
            try (InputStream inputStream = inputResources.get(0).openStream()) {
                this.witnesses = JsonProcessor.read(inputStream).getWitnesses();
            }
        } else {
            this.witnesses = Lists.newArrayListWithExpectedSize(inputResources.size());
            //noinspection Convert2streamapi
            for (URL witnessURL : inputResources) {
                this.witnesses.add(new URLWitness("w" + (witnesses.size() + 1), witnessURL).read(tokenizer,
                        normalizer, inputCharset, (xmlMode ? tokenXPath : null)));
            }
        }
        return this;
    }

    Engine collate() {
        new SimpleCollation(witnesses, collationAlgorithm, joined).collate(variantGraph);
        return this;
    }

    void write() throws IOException {
        final SimpleVariantGraphSerializer serializer = new SimpleVariantGraphSerializer(variantGraph);
        if ("csv".equals(outputFormat)) {
            serializer.toCsv(out);
        } else if ("dot".equals(outputFormat)) {
            serializer.toDot(out);
        } else if ("graphml".equals(outputFormat) || "tei".equals(outputFormat)) {
            XMLStreamWriter xml = null;
            try {
                xml = XMLOutputFactory.newInstance().createXMLStreamWriter(out);
                xml.writeStartDocument(outputCharset.name(), "1.0");
                if ("graphml".equals(outputFormat)) {
                    serializer.toGraphML(xml);
                } else {
                    serializer.toTEI(xml);
                }
                xml.writeEndDocument();
            } catch (XMLStreamException e) {
                throw new IOException(e);
            } finally {
                if (xml != null) {
                    try {
                        xml.close();
                    } catch (XMLStreamException e) {
                        throw new IOException(e);
                    }
                }
            }
        } else {
            JsonProcessor.write(variantGraph, out);
        }
    }

    Engine log(String str) {
        log.write(str);
        return this;
    }

    void error(String str, Throwable t) {
        errorOccurred = true;
        log(str).log("\n").log(t.getMessage()).log("\n");
    }

    void help() {
        new HelpFormatter().printHelp(log, 78,
                "collatex [<options>]\n (<json_input> | <witness_1> <witness_2> [[<witness_3>] ...])", "", OPTIONS,
                2, 4, "");
    }

    URL argumentToResource(String arg) throws ParseException {
        try {
            final File witnessFile = new File(arg);
            if (witnessFile.exists()) {
                return witnessFile.toURI().normalize().toURL();
            } else {
                return new URL(arg);
            }
        } catch (MalformedURLException urlEx) {
            throw new ParseException("Invalid resource: " + arg);
        }
    }

    public static void main(String... args) {
        final Engine engine = new Engine();
        try {
            final CommandLine commandLine = new GnuParser().parse(OPTIONS, args);
            if (commandLine.hasOption("h")) {
                engine.help();
                return;
            }
            engine.configure(commandLine).read().collate().write();
        } catch (ParseException e) {
            engine.error("Error while parsing command line arguments", e);
            engine.log("\n").help();
        } catch (IllegalArgumentException e) {
            engine.error("Illegal argument", e);
        } catch (IOException e) {
            engine.error("I/O error", e);
        } catch (SAXException e) {
            engine.error("XML error", e);
        } catch (XPathExpressionException e) {
            engine.error("XPath error", e);
        } catch (ScriptException | PluginScript.PluginScriptExecutionException e) {
            engine.error("Script error", e);
        } finally {
            try {
                Closeables.close(engine, false);
            } catch (IOException e) {
            }
        }
    }

    static final Options OPTIONS = new Options();

    static {
        OPTIONS.addOption("h", "help", false, "print usage instructions (which your are looking at right now)");
        OPTIONS.addOption("o", "output", true, "output file; '-' for standard output (default)");
        OPTIONS.addOption("ie", "input-encoding", true,
                "charset to use for decoding non-XML witnesses; default: UTF-8");
        OPTIONS.addOption("oe", "output-encoding", true, "charset to use for encoding the output; default: UTF-8");
        OPTIONS.addOption("xml", "xml-mode", false, "witnesses are treated as XML documents");
        OPTIONS.addOption("xp", "xpath", true,
                "XPath 1.0 expression evaluating to tokens of XML witnesses; default: '//text()'");
        OPTIONS.addOption("a", "algorithm", true,
                "progressive alignment algorithm to use 'dekker' (default), 'medite', 'needleman-wunsch'");
        OPTIONS.addOption("t", "tokenized", false,
                "consecutive matches of tokens will *not* be joined to segments");
        OPTIONS.addOption("f", "format", true, "result/output format: 'json', 'csv', 'dot', 'graphml', 'tei'");
        OPTIONS.addOption("s", "script", true,
                "ECMA/JavaScript resource with functions to be plugged into the alignment algorithm");
    }

    @Override
    public void close() throws IOException {
        final Closer closer = Closer.create();
        try {
            if (out != null) {
                closer.register(out).flush();
            }
            if (log != null) {
                closer.register(log).flush();
            }
        } finally {
            closer.close();
        }
        if (errorOccurred && (outFile != null) && outFile.isFile()) {
            //noinspection ResultOfMethodCallIgnored
            outFile.delete();
        }
    }
}