eu.project.ttc.tools.cli.TermSuiteAlignerCLI.java Source code

Java tutorial

Introduction

Here is the source code for eu.project.ttc.tools.cli.TermSuiteAlignerCLI.java

Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package eu.project.ttc.tools.cli;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;

import eu.project.ttc.api.JsonOptions;
import eu.project.ttc.engines.BilingualAligner;
import eu.project.ttc.engines.BilingualAligner.TranslationCandidate;
import eu.project.ttc.metrics.Cosine;
import eu.project.ttc.metrics.Jaccard;
import eu.project.ttc.metrics.SimilarityDistance;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.index.JsonTermIndexIO;
import eu.project.ttc.tools.TermSuiteAlignerBuilder;

/**
 * Command line interface for the bilingual term aligner.
 *
 * <p>Loads a source and a target terminology from JSON files, builds a
 * {@link BilingualAligner} with the configured dictionary path and similarity
 * distance, and prints the translation candidates of every requested source
 * term as tab-separated lines.
 *
 * @author Damien Cram
 */
public class TermSuiteAlignerCLI {
    private static final Logger LOGGER = LoggerFactory.getLogger(TermSuiteAlignerCLI.class);

    /** Short usage description of the CLI */
    private static final String USAGE = "java [-DconfigFile=<file>] -Xms1g -Xmx2g -cp termsuite-core-x.x.jar eu.project.ttc.tools.cli.TermSuiteAlignerCLI";

    /// Parameter names
    private static final String TERM = "term";
    private static final String N = "n";
    private static final String TERM_LIST = "term-list";
    private static final String SOURCE_TERMINO = "source-termino";
    private static final String TARGET_TERMINO = "target-termino";
    private static final String DICTIONARY = "dictionary";
    private static final String DISTANCE = "distance";
    private static final String EXPLAIN = "explain";

    // Parameter options
    private static final String DISTANCE_COSINE = "cosine";
    private static final String DISTANCE_JACCARD = "jaccard";

    // Values read from the command line (defaults apply when the option is absent)

    private Optional<TermIndex> sourceTermino = Optional.absent();
    private Optional<TermIndex> targetTermino = Optional.absent();
    private String dicoPath;
    /** Number of translation candidates requested from the aligner for each term. */
    private int n = 10;
    /** Source terms to align, in the order they were supplied. */
    private List<String> terms = Lists.newArrayList();
    private SimilarityDistance distance = new Cosine();
    private boolean showExplanation = false;

    /**
     * Application entry point. Runs the aligner CLI and writes results to
     * {@link System#out}.
     *
     * @param args
     *            Command line arguments
     * @throws UnsupportedEncodingException
     *             declared for backward compatibility with existing callers
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        new TermSuiteAlignerCLI().run(args, System.out);
    }

    /**
     * Parses the command line, loads both terminologies, aligns every requested
     * term and prints its translation candidates.
     *
     * <p>Logging is redirected to a timestamped file under the {@code logs}
     * directory (created if missing). A command line that cannot be parsed
     * prints the usage; any other failure is printed to {@link System#err} and
     * logged with its stack trace.
     *
     * @param args
     *            Command line arguments
     * @param out
     *            Stream receiving the alignment results
     */
    public void run(String[] args, PrintStream out) {
        File logDir = new File("logs");
        if (!logDir.exists()) {
            logDir.mkdir();
        }
        String logFileName = "termsuite-aligner-" + new SimpleDateFormat("yyyyMMdd-HHmmss").format(new Date())
                + ".log";
        String logPath = Paths.get("logs", logFileName).toAbsolutePath().toString();
        TermSuiteCLIUtils.logToFile(logPath);
        Stopwatch sw = Stopwatch.createStarted();
        LOGGER.info("Logging to {}", logPath);
        try {
            // See USAGE for the expected java invocation of this CLI.
            PosixParser parser = new PosixParser();
            Options options = declareOptions();

            try {
                // Parse and set CL options
                CommandLine line = parser.parse(options, args, false);
                readArguments(line, out);
                TermSuiteCLIUtils.setGlobalLogLevel("info");
                TermSuiteCLIUtils.logCommandLineOptions(line);

                BilingualAligner aligner = TermSuiteAlignerBuilder.start()
                        .setSourceTerminology(sourceTermino.get())
                        .setTargetTerminology(targetTermino.get())
                        .setDicoPath(dicoPath)
                        .setDistance(distance)
                        .create();

                for (String term : terms) {
                    Term sourceTerm = readSourceTerm(term);
                    if (sourceTerm == null) {
                        LOGGER.error("Cannot find term \"{}\" in {}", term, line.getOptionValue(SOURCE_TERMINO));
                        continue;
                    }
                    // A header is only needed to tell the results of several terms apart.
                    if (terms.size() > 1) {
                        out.println("---");
                        out.println(sourceTerm);
                        out.println("-");
                    }
                    printCandidates(aligner, sourceTerm, out);
                }

                LOGGER.info("Script executed in {}", sw);

            } catch (ParseException e) {
                TermSuiteCLIUtils.printUsage(e, USAGE, options);
            }

        } catch (Exception e) {
            // Logging goes to a file, so the console trace is kept for the user;
            // the throwable is also handed to the logger so the log file has the stack trace.
            e.printStackTrace(System.err);
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Prints one tab-separated line per translation candidate of the given term:
     * term, score (3 decimals), method and, when explanations are enabled, the
     * explanation text.
     */
    private void printCandidates(BilingualAligner aligner, Term sourceTerm, PrintStream out) {
        // NOTE(review): the meaning of the third argument (1) is defined by
        // BilingualAligner.align and is not visible from this file.
        for (TranslationCandidate candidate : aligner.align(sourceTerm, n, 1)) {
            if (showExplanation) {
                out.format("%s\t%.3f\t%s\t%s\n", candidate.getTerm(), candidate.getScore(),
                        candidate.getMethod(), candidate.getExplanation().getText());
            } else {
                out.format("%s\t%.3f\t%s\n", candidate.getTerm(), candidate.getScore(),
                        candidate.getMethod());
            }
        }
    }

    /**
     * Looks up a term of the source terminology by its grouping key, pilot or
     * lemma. Pilot and lemma are also compared against the lower-cased input.
     *
     * @param term
     *            the text to look up
     * @return the first matching term, or {@code null} when none matches
     */
    private Term readSourceTerm(String term) {
        // Computed once instead of twice per indexed term.
        String lowerCased = term.toLowerCase();
        for (Term t : sourceTermino.get().getTerms()) {
            if (t.getGroupingKey().equals(term) || t.getPilot().equals(term) || t.getLemma().equals(term)
                    || t.getPilot().equals(lowerCased) || t.getLemma().equals(lowerCased)) {
                return t;
            }
        }
        return null;
    }

    /** Declares every command line option accepted by this CLI. */
    private Options declareOptions() {
        Options options = new Options();

        options.addOption(
                TermSuiteCLIUtils.createOption(null, SOURCE_TERMINO, true, "Source terminology (json file)", true));

        options.addOption(
                TermSuiteCLIUtils.createOption(null, TARGET_TERMINO, true, "Target terminology (json file)", true));

        options.addOption(TermSuiteCLIUtils.createOption(null, TERM, true, "Source term to align", false));

        options.addOption(TermSuiteCLIUtils.createOption(null, TERM_LIST, true,
                "File containing a list of source terms to align (one per line)", false));

        options.addOption(TermSuiteCLIUtils.createOption(null, DICTIONARY, true, "Alignment dictionaries", true));

        options.addOption(TermSuiteCLIUtils.createOption(N, N, true,
                "The number of translation candidates to show in the output", false));

        options.addOption(TermSuiteCLIUtils.createOption(null, DISTANCE, true,
                "Similarity measure to compute the distance between two vectors [" + DISTANCE_COSINE + ","
                        + DISTANCE_JACCARD + "]",
                false));

        options.addOption(TermSuiteCLIUtils.createOption(null, EXPLAIN, false,
                "Shows for each aligned term the most influencial co-terms", false));

        return options;
    }

    /**
     * Reads the parsed command line into this instance: terms to align, number
     * of candidates, similarity distance, both terminologies, dictionary path
     * and the explanation flag.
     *
     * <p>Exits the JVM with status 1 when neither {@code --term} nor
     * {@code --term-list} is given.
     *
     * @param line
     *            the parsed command line
     * @param out
     *            the result stream, flushed before an error is reported
     * @throws IOException
     *             if the term list or a terminology file cannot be read
     */
    public void readArguments(CommandLine line, PrintStream out) throws IOException {
        if (!line.hasOption(TERM) && !line.hasOption(TERM_LIST)) {
            String msg = String.format("ERROR: One option of --%s or --%s must be provided.", TERM, TERM_LIST);
            LOGGER.error(msg);
            System.err.flush();
            out.flush();
            System.err.println(msg);
            System.exit(1);
        }
        if (line.hasOption(TERM)) {
            terms.add(line.getOptionValue(TERM));
        }
        if (line.hasOption(TERM_LIST)) {
            File file = new File(line.getOptionValue(TERM_LIST));
            Splitter tabSplitter = Splitter.on("\t");
            for (String row : FileUtils.readLines(file, "UTF-8")) {
                // Only the first tab-separated column holds the term.
                String term = tabSplitter.splitToList(row).get(0).trim();
                // Blank rows would only yield a spurious "Cannot find term" error.
                if (!term.isEmpty()) {
                    terms.add(term);
                }
            }
        }
        if (line.hasOption(N)) {
            String rawN = line.getOptionValue(N);
            try {
                n = Integer.parseInt(rawN);
            } catch (NumberFormatException e) {
                TermSuiteCLIUtils.exitWithErrorMessage(
                        String.format("Invalid value for option -%s: %s. An integer is expected.", N, rawN));
            }
        }
        if (line.hasOption(DISTANCE)) {
            String distanceName = line.getOptionValue(DISTANCE);
            if (distanceName.equals(DISTANCE_COSINE)) {
                distance = new Cosine();
            } else if (distanceName.equals(DISTANCE_JACCARD)) {
                distance = new Jaccard();
            } else {
                TermSuiteCLIUtils.exitWithErrorMessage(String.format("Unknown distance: %s. Allowed values: %s;%s",
                        distanceName, DISTANCE_COSINE, DISTANCE_JACCARD));
            }
        }
        LOGGER.info("loading source termino {}", line.getOptionValue(SOURCE_TERMINO));
        JsonOptions loadOptions = new JsonOptions().withContexts(true);
        sourceTermino = Optional.of(loadTermino(line.getOptionValue(SOURCE_TERMINO), loadOptions));
        LOGGER.info("loading target termino {}", line.getOptionValue(TARGET_TERMINO));
        targetTermino = Optional.of(loadTermino(line.getOptionValue(TARGET_TERMINO), loadOptions));
        dicoPath = line.getOptionValue(DICTIONARY);

        showExplanation = line.hasOption(EXPLAIN);
    }

    /**
     * Loads a JSON terminology file and always closes the underlying reader,
     * even when loading fails.
     */
    private static TermIndex loadTermino(String path, JsonOptions loadOptions) throws IOException {
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm this matches the encoding the terminology files are written with.
        try (FileReader reader = new FileReader(path)) {
            return JsonTermIndexIO.load(reader, loadOptions);
        }
    }

}