// Java tutorial
/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import tml.Configuration; import tml.annotators.Annotator; import tml.corpus.CorpusParameters; import tml.corpus.SearchResultsCorpus; import tml.corpus.TextDocument; import tml.corpus.CorpusParameters.DimensionalityReduction; import tml.corpus.CorpusParameters.TermSelection; import tml.storage.Repository; import tml.vectorspace.TermWeighting.GlobalWeight; import tml.vectorspace.TermWeighting.LocalWeight; import tml.vectorspace.operations.Operation; /** * Command line interface for TML, this is probably the easiest way to access it. * * Intended use should be: * usage: tml <options> [parameters] operation * -I Insert documents into repository. 
* --iannotators <arg> List of annotators to use when inserting * the documents. (e.g. PennTreeAnnotator). * --iclean Empties the repository before inserting * new ones. * --idocs <folder> The folder that contains the documens to * insert. * --imaxdocs <number> Maximum number of documents to index or * use in an operation. * -O Performs an operation on a corpus. * --oalldocs <type> Use all documents in repository as single * document corpora, it can be sentence or paragraph based. (e.g. sentence). * --obk <query> Lucene query that defines a background * knowledge on which the corpus will be projected. (e.g. "type:sentences AND * reference:Document*"). * --obkpar <parameter file> Properties file with the background * knowledge corpus parameters, if not set it will use the same as the * corpus. * --ocorpus <query> Lucene query that defines the corpus to * operate with. (e.g. "type:sentence AND reference:Document01"). * --ocpar <parameter file> Properties file with the corpus parameters * (optional). * --odim <list> Name of the Dimensionality Reduction * criteria. (e.g. VARPCT,NUM,PCT,NO). * --odimth <list> Threshold for the dim options. (e.g. * 0,1,2). * --olanczos Use Lanczos for SVD decomposition. * --operations <list> The list of operations you want to execute * on the corpus. (e.g. PassageDistances,PassageSimilarity . * --oresults <folder> Folder where to store the results. (e.g. * results/run01/). * --otsel <name> Name of the Term selection criteria * (TF,AVG_TF,DF). * --otselth <number> Threshold for the tsel criteria option. * --otwg <list> Name of the Global Weight to apply. (e.g. * None,Normal,GfIdf,Idf,Entropy). * --otwl <list> Name of the Local Weight to apply. * (e.g.Binary,TF,TFn,LOGTF). * -repo <folder> Full path of the repository folder, where * TML will retrieve (or insert) documents. (e.g. /home/user/lucene). 
* * @author Jorge Villalon * */ public class TmlCommandLine { private static Logger logger = Logger.getLogger(TmlCommandLine.class); private static Repository repository = null; private static CommandLine line = null; private static Options options = null; private static String repositoryFolder = null; @SuppressWarnings("static-access") public static void main(String[] args) { long time = System.nanoTime(); options = new Options(); // Repository options.addOption(OptionBuilder.withDescription( "Full path of the repository folder, where TML will retrieve (or insert) documents. (e.g. /home/user/lucene).") .hasArg().withArgName("folder").isRequired().create("repo")); // Verbosity options.addOption( OptionBuilder.withDescription("Verbose output in the console (it goes verbose to the log file).") .hasArg(false).isRequired(false).create("v")); // Operation on corpus options.addOption(OptionBuilder.hasArg(false).withDescription("Performs an operation on a corpus.") .isRequired(false).create("O")); // The list of operations options.addOption(OptionBuilder.withDescription( "The list of operations you want to execute on the corpus. (e.g. PassageDistances,PassageSimilarity .") .hasArgs().withValueSeparator(',').withArgName("list").isRequired(false).withLongOpt("operations") .create()); // The file to store the results options.addOption(OptionBuilder.withDescription("Folder where to store the results. (e.g. results/run01/).") .hasArg().withArgName("folder").isRequired(false).withLongOpt("oresults").create()); // The corpus on which operate options.addOption(OptionBuilder.withDescription( "Lucene query that defines the corpus to operate with. (e.g. \"type:sentence AND reference:Document01\").") .hasArg().withArgName("query").isRequired(false).withLongOpt("ocorpus").create()); // The corpus on which operate options.addOption(OptionBuilder.withDescription( "Use all documents in repository as single document corpora, it can be sentence or paragraph based. (e.g. 
sentence).") .hasArgs().withArgName("type").isRequired(false).withLongOpt("oalldocs").create()); // The properties file for the corpus options.addOption(OptionBuilder.withDescription("Properties file with the corpus parameters (optional).") .hasArg().withArgName("parameter file").isRequired(false).withLongOpt("ocpar").create()); // Background knowledge corpus options.addOption(OptionBuilder.withDescription( "Lucene query that defines a background knowledge on which the corpus will be projected. (e.g. \"type:sentences AND reference:Document*\").") .hasArg().withArgName("query").isRequired(false).withLongOpt("obk").create()); // Background knowledge parameters options.addOption(OptionBuilder.withDescription( "Properties file with the background knowledge corpus parameters, if not set it will use the same as the corpus.") .hasArg().withArgName("parameter file").isRequired(false).withLongOpt("obkpar").create()); // Term selection String criteria = ""; for (TermSelection tsel : TermSelection.values()) { criteria += "," + tsel.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder.hasArgs().withArgName("name") .withDescription("Name of the Term selection criteria (" + criteria + ").").isRequired(false) .withValueSeparator(',').withLongOpt("otsel").create()); // Term selection threshold options.addOption(OptionBuilder.hasArgs().withArgName("number") .withDescription("Threshold for the tsel criteria option.").withType(Integer.TYPE).isRequired(false) .withValueSeparator(',').withLongOpt("otselth").create()); // Dimensionality reduction criteria = ""; for (DimensionalityReduction dim : DimensionalityReduction.values()) { criteria += "," + dim.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder.hasArgs().withArgName("list") .withDescription("Name of the Dimensionality Reduction criteria. (e.g. 
" + criteria + ").") .isRequired(false).withValueSeparator(',').withLongOpt("odim").create()); // Dimensionality reduction threshold options.addOption(OptionBuilder.hasArgs().withArgName("list") .withDescription("Threshold for the dim options. (e.g. 0,1,2).").isRequired(false) .withValueSeparator(',').withLongOpt("odimth").create()); // Local weight criteria = ""; for (LocalWeight weight : LocalWeight.values()) { criteria += "," + weight.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder.hasArgs().withArgName("list") .withDescription("Name of the Local Weight to apply. (e.g." + criteria + ").").isRequired(false) .withValueSeparator(',').withLongOpt("otwl").create()); // Global weight criteria = ""; for (GlobalWeight weight : GlobalWeight.values()) { criteria += "," + weight.name(); } criteria = criteria.substring(1); options.addOption(OptionBuilder.hasArgs().withArgName("list") .withDescription("Name of the Global Weight to apply. (e.g. " + criteria + ").").isRequired(false) .withValueSeparator(',').withLongOpt("otwg").create()); // Use Lanczos options.addOption(OptionBuilder.hasArg(false).withDescription("Use Lanczos for SVD decomposition.") .isRequired(false).withLongOpt("olanczos").create()); // Inserting documents in repository options.addOption(OptionBuilder.hasArg(false).withDescription("Insert documents into repository.") .isRequired(false).create("I")); // Max documents to insert options.addOption(OptionBuilder.hasArg().withArgName("number") .withDescription("Maximum number of documents to index or use in an operation.") .withType(Integer.TYPE).isRequired(false).withLongOpt("imaxdocs").create()); // Clean repository options.addOption( OptionBuilder.hasArg(false).withDescription("Empties the repository before inserting new ones.") .isRequired(false).withLongOpt("iclean").create()); // Use annotator options.addOption(OptionBuilder.hasArgs() .withDescription( "List of annotators to use when inserting the documents. (e.g. 
PennTreeAnnotator).") .isRequired(false).withValueSeparator(',').withLongOpt("iannotators").create()); // Documents folder options.addOption(OptionBuilder.hasArg().withArgName("folder") .withDescription("The folder that contains the documens to insert.").isRequired(false) .withLongOpt("idocs").create()); // Initializing the line parser CommandLineParser parser = new PosixParser(); try { line = parser.parse(options, args); } catch (ParseException e) { printHelp(options); return; } // Validate that either inserting or an operation are given if (!line.hasOption("I") && !line.hasOption("O")) { System.out.println("One of the options -I or -O must be present."); printHelp(options); return; } repositoryFolder = line.getOptionValue("repo"); try { if (line.hasOption("I")) { indexing(); } else if (line.hasOption("O")) { operation(); } } catch (ParseException e) { System.out.println(e.getMessage()); printHelp(options); return; } System.out.println("TML finished successfully in " + (System.nanoTime() - time) * 10E-9 + " seconds."); return; } private static void printHelp(Options options) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("tml <options> [parameters] operation", options); } @SuppressWarnings("rawtypes") private static void indexing() throws ParseException { if (!line.hasOption("idocs")) { throw new ParseException("Indexing requires the idocs option."); } if (!startTML()) { throw new ParseException("Fatal error initializing TML."); } if (line.hasOption("iclean")) { try { Repository.cleanStorage(repositoryFolder); } catch (Exception e) { logger.error(e); return; } } try { repository = new Repository(repositoryFolder); } catch (Exception e) { e.printStackTrace(); logger.error(e); return; } // Remove all annotators because in command line mode they must be added one by one for (int i = repository.getAnnotators().size() - 1; i >= 0; i--) { Annotator annotator = repository.getAnnotators().get(i); repository.removeAnnotator(annotator); } String[] 
annotatorsList = line.getOptionValues("iannotators"); if (annotatorsList != null && annotatorsList.length > 0) { for (String annotatorName : annotatorsList) { Class classDefinition = null; Annotator annotator = null; try { classDefinition = Class.forName("tml.annotators." + annotatorName); annotator = (Annotator) classDefinition.newInstance(); } catch (Exception e) { logger.error("The annotator wasn't found! " + annotatorName); logger.error(e); continue; } repository.addAnnotator(annotator); } } String documentsFolder = line.getOptionValue("idocs"); try { if (line.hasOption("imaxdocs")) { int maxDocs = Integer.parseInt(line.getOptionValue("imaxdocs")); repository.addDocumentsInFolder(documentsFolder, maxDocs); } else { repository.addDocumentsInFolder(documentsFolder); } } catch (IOException e) { logger.error(e); return; } } @SuppressWarnings("rawtypes") private static void operation() throws ParseException { if (line.hasOption("ocorpus") && (line.getOptionValue("ocorpus") == null || line.getOptionValue("ocorpus").trim().length() == 0)) { throw new ParseException("Invalid ocorpus option argument value."); } String allDocsCorpusType = line.getOptionValue("oalldocs"); if (line.hasOption("oalldocs") && !allDocsCorpusType.equals("sentence") && !allDocsCorpusType.equals("paragraph")) { throw new ParseException("Invalid oalldocs option argument value."); } String[] operations = line.getOptionValues("operations"); if (operations == null || operations.length == 0) { throw new ParseException("You must specify at least one operation!"); } if (!startTML()) { throw new ParseException("Fatal error initializing TML."); } try { repository = new Repository(repositoryFolder); } catch (Exception e) { logger.error(e); return; } String[] corpusQueries = null; if (line.hasOption("ocorpus")) { corpusQueries = new String[1]; corpusQueries[0] = line.getOptionValue("ocorpus"); } else if (line.hasOption("oalldocs")) { List<TextDocument> docs = null; try { docs = 
repository.getAllTextDocuments(); } catch (Exception e) { logger.fatal("Couldn't get list of documents from repository."); throw new ParseException(e.getMessage()); } corpusQueries = new String[docs.size()]; for (int i = 0; i < docs.size(); i++) { TextDocument doc = docs.get(i); String referenceId = null; if (line.getOptionValue("oalldocs").equals("sentence")) referenceId = "p*d" + doc.getExternalId(); else referenceId = doc.getExternalId(); corpusQueries[i] = "type:" + line.getOptionValue("oalldocs") + " AND reference:" + referenceId; } } String corpusLine = "NoCorpus"; if (line.hasOption("ocorpus")) corpusLine = line.getOptionValue("ocorpus").replaceAll("\\W", ""); else if (line.hasOption("oalldocs")) corpusLine = "AllDocuments"; String resultsFilename = repository.getIndexPath().substring(1).replaceAll("[/\\\\]", "_") + "." + corpusLine + "." + (new SimpleDateFormat("yyyy-MM-dd-hh-mm")).format(new Date()) + ".txt"; // Initialize arrays and set default parameters DimensionalityReduction[] dims = new DimensionalityReduction[1]; double[] dimths = new double[1]; boolean lanczos = false; TermSelection[] tsels = new TermSelection[1]; double[] tselths = new double[1]; LocalWeight[] twlocals = new LocalWeight[1]; GlobalWeight[] twglobals = new GlobalWeight[1]; CorpusParameters parameters = new CorpusParameters(); dims[0] = parameters.getDimensionalityReduction(); dimths[0] = parameters.getDimensionalityReductionThreshold(); lanczos = parameters.isLanczosSVD(); tsels[0] = parameters.getTermSelectionCriterion(); tselths[0] = parameters.getTermSelectionThreshold(); twlocals[0] = parameters.getTermWeightLocal(); twglobals[0] = parameters.getTermWeightGlobal(); // If the ocpar option is given, load the parameters file and // override the default parameters if (line.hasOption("ocpar")) { parameters.loadFromFile(new File(line.getOptionValue("ocpar"))); dims[0] = parameters.getDimensionalityReduction(); dimths[0] = parameters.getDimensionalityReductionThreshold(); lanczos = 
parameters.isLanczosSVD(); tsels[0] = parameters.getTermSelectionCriterion(); tselths[0] = parameters.getTermSelectionThreshold(); twlocals[0] = parameters.getTermWeightLocal(); twglobals[0] = parameters.getTermWeightGlobal(); } else { // Check for every possible parameter if (line.hasOption("odim")) { dims = new DimensionalityReduction[line.getOptionValues("odim").length]; for (int i = 0; i < dims.length; i++) dims[i] = DimensionalityReduction.valueOf(line.getOptionValues("odim")[i]); } if (line.hasOption("odimth")) { dimths = new double[line.getOptionValues("odimth").length]; for (int i = 0; i < dimths.length; i++) dimths[i] = Double.parseDouble(line.getOptionValues("odimth")[i]); } if (line.hasOption("olanczos")) lanczos = true; else lanczos = false; if (line.hasOption("otsel")) { tsels = new TermSelection[line.getOptionValues("otsel").length]; for (int i = 0; i < tsels.length; i++) tsels[i] = TermSelection.valueOf(line.getOptionValues("otsel")[i]); } if (line.hasOption("otselth")) { tselths = new double[line.getOptionValues("otselth").length]; for (int i = 0; i < tselths.length; i++) tselths[i] = Double.parseDouble(line.getOptionValues("otselth")[i]); } if (line.hasOption("otwl")) { twlocals = new LocalWeight[line.getOptionValues("otwl").length]; for (int i = 0; i < twlocals.length; i++) twlocals[i] = LocalWeight.valueOf(line.getOptionValues("otwl")[i]); } if (line.hasOption("otwg")) { twglobals = new GlobalWeight[line.getOptionValues("otwg").length]; for (int i = 0; i < twglobals.length; i++) twglobals[i] = GlobalWeight.valueOf(line.getOptionValues("otwg")[i]); } } String resultsFolder = line.getOptionValue("oresults"); FileWriter writer = null; if (resultsFolder != null) { File resultsFold = new File(resultsFolder); if (resultsFold.exists() && resultsFold.isDirectory()) { try { File results = new File(resultsFolder + "/" + resultsFilename); writer = new FileWriter(results); } catch (IOException e) { logger.error(e); writer = null; } } } // Create the whole 
combination of parameters for (TermSelection tsel : tsels) for (double tselth : tselths) for (LocalWeight lw : twlocals) for (GlobalWeight gw : twglobals) { CorpusParameters p = new CorpusParameters(); p.setTermSelectionCriterion(tsel); p.setLanczosSVD(lanczos); p.setTermSelectionCriterion(tsel); p.setTermSelectionThreshold(tselth); p.setTermWeightLocal(lw); p.setTermWeightGlobal(gw); logger.debug("Parameters to execute: " + p.toString()); SearchResultsCorpus backgroundKnowledgeCorpus = null; // If we have background knowledge, load it if (line.hasOption("obk")) { backgroundKnowledgeCorpus = new SearchResultsCorpus(line.getOptionValue("obk")); if (line.hasOption("obkpar")) { CorpusParameters bkParameters = new CorpusParameters(); bkParameters.loadFromFile(new File(line.getOptionValue("obkpar"))); backgroundKnowledgeCorpus.setParameters(bkParameters); } try { backgroundKnowledgeCorpus.load(repository); } catch (Exception e) { logger.error("Couldn't load background knowledge corpus."); logger.error(e); e.printStackTrace(); continue; } } // Create the corpus with the query for (String corpusQuery : corpusQueries) { SearchResultsCorpus corpus = new SearchResultsCorpus(corpusQuery); // Loading the corpus try { corpus.setParameters(p); corpus.load(repository); } catch (Exception e) { logger.error("Couldn't load corpus. 
" + corpus.getLuceneQuery()); logger.error(e); continue; } for (DimensionalityReduction dred : dims) for (double dimth : dimths) { p.setDimensionalityReduction(dred); p.setDimensionalityReductionThreshold(dimth); try { corpus.getParameters().setDimensionalityReduction(dred); corpus.getParameters().setDimensionalityReductionThreshold(dimth); if (backgroundKnowledgeCorpus == null) corpus.getSemanticSpace().calculate(); } catch (Exception e) { logger.error("Couldn't calculate corpus' semantic space"); logger.error(e); e.printStackTrace(); continue; } for (String operation : operations) { Class classDefinition = null; Operation op = null; try { classDefinition = Class .forName("tml.vectorspace.operations." + operation); op = (Operation) classDefinition.newInstance(); } catch (Exception e) { logger.error("The operation wasn't found"); e.printStackTrace(); logger.error(e); continue; } op.setCorpus(corpus); if (backgroundKnowledgeCorpus != null) op.setBackgroundKnowledgeCorpus(backgroundKnowledgeCorpus); try { op.start(); } catch (Exception e) { logger.error("Error while performing the operation"); e.printStackTrace(); logger.error(e); continue; } String backgroundline = "None"; String parametersline = corpus.getParameters().toString(); if (backgroundKnowledgeCorpus != null) { backgroundline = backgroundKnowledgeCorpus.getLuceneQuery() + " [" + backgroundKnowledgeCorpus.getSemanticSpace() .getDimensionsKept() + "]"; parametersline = backgroundKnowledgeCorpus.getParameters().toString(); } else { backgroundline += " [" + corpus.getSemanticSpace().getDimensionsKept() + "]"; } String corpusline = "Corpus:" + corpus.getLuceneQuery() + "\n" + "Operation:" + op.getClass().getName() + "\n" + "Background:" + backgroundline + "\n" + "Parameters:" + parametersline + "\n"; if (writer != null) { try { writer.append(corpusline); writer.append(op.getResultsCSVString()); } catch (IOException e) { logger.error("Error writing file " + corpusline); logger.error(e); } } else { 
System.out.println(corpusline); System.out.println(op.getResultsCSVString()); } } } } } if (writer != null) { try { writer.close(); } catch (IOException e) { logger.error(e); } } } private static boolean startTML() { try { if (line.hasOption("v")) { PropertyConfigurator.configure(Configuration.getTmlProperties(true)); } else { PropertyConfigurator.configure(Configuration.getTmlProperties()); } } catch (IOException e1) { System.out.println("TML jar file is corrupt, please contact the author."); return false; } return true; } }