com.joliciel.talismane.other.Extensions.java Source code

Java tutorial

Introduction

Here is the source code for com.joliciel.talismane.other.Extensions.java

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2013 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.other;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.talismane.Talismane;
import com.joliciel.talismane.TalismaneConfig;
import com.joliciel.talismane.TalismaneException;
import com.joliciel.talismane.other.corpus.CorpusProjectifier;
import com.joliciel.talismane.other.corpus.CorpusStatistics;
import com.joliciel.talismane.other.corpus.CorpusModifier;
import com.joliciel.talismane.other.standoff.StandoffReader;
import com.joliciel.talismane.other.standoff.StandoffWriter;
import com.joliciel.talismane.output.FreemarkerTemplateWriter;
import com.joliciel.talismane.parser.ParserRegexBasedCorpusReader;
import com.joliciel.talismane.utils.LogUtils;

/**
 * Adds a set of extended commands on top of the standard Talismane commands.
 * <p>
 * Intended call order, per the method documentation below: {@link #pluckParameters(Map)} first,
 * on the raw argument map, then {@link #prepareCommand(TalismaneConfig, Talismane)} just before
 * the Talismane command is run.
 */
public class Extensions {
    private static final Log LOG = LogFactory.getLog(Extensions.class);

    /** Value of the "referenceStats" argument, or null if it was not supplied. */
    String referenceStatsPath = null;
    /** Value of the "corpusRules" argument, or null if it was not supplied. */
    String corpusRulesPath = null;
    /** The extended command named by the "command" argument, or null if none was recognised. */
    ExtendedCommand command = null;

    /**
     * The extended commands recognised by {@link #pluckParameters(Map)}.
     * Note that {@code splitConllFile} has no branch in
     * {@link #prepareCommand(TalismaneConfig, Talismane)}: selecting it there results in
     * an "Unknown command" {@link RuntimeException}.
     */
    public enum ExtendedCommand {
        toStandoff, toStandoffSentences, fromStandoff, splitConllFile, corpusStatistics, modifyCorpus, projectify
    }

    /**
     * To be called initially, so that any parameters specific to the extensions can be removed
     * and/or replaced in the argument map.
     * <p>
     * "referenceStats" and "corpusRules" are stored and removed from the map. If "command" names
     * an {@link ExtendedCommand}, it is stored and the map is rewritten to
     * command=process, module=parse; any other command value is left untouched.
     * @param args the argument map, modified in place
     */
    public void pluckParameters(Map<String, String> args) {
        if (args.containsKey("referenceStats")) {
            referenceStatsPath = args.remove("referenceStats");
        }
        if (args.containsKey("corpusRules")) {
            corpusRulesPath = args.remove("corpusRules");
        }

        // A null value cannot name an extended command; guarding here avoids the
        // NullPointerException that Enum.valueOf throws for a null name.
        String commandName = args.get("command");
        if (commandName != null) {
            try {
                command = ExtendedCommand.valueOf(commandName);
                args.put("command", "process");
                args.put("module", "parse");
            } catch (IllegalArgumentException ignored) {
                // Not an extended command: leave the argument map as it is.
            }
        }
    }

    /**
     * To be called just before running the Talismane command, to
     * prepare anything specifically required for extensions to function correctly.
     * Does nothing if no extended command was selected.
     * @param config the configuration to read from and, for some commands, to update
     * @param talismane the instance whose parse configuration processor is replaced
     * @throws TalismaneException if the command is modifyCorpus and no corpusRules were given,
     *         or if the template for toStandoffSentences cannot be found
     * @throws RuntimeException if the command has no branch here, or wrapping any IOException
     */
    public void prepareCommand(TalismaneConfig config, Talismane talismane) {
        if (command == null) {
            return;
        }

        try {
            switch (command) {
            case toStandoff:
                talismane.setParseConfigurationProcessor(new StandoffWriter());
                break;
            case toStandoffSentences:
                talismane.setParseConfigurationProcessor(this.openStandoffSentencesWriter());
                break;
            case fromStandoff:
                config.setParserCorpusReader(new StandoffReader(new Scanner(config.getReader())));
                break;
            case corpusStatistics:
                this.prepareCorpusStatistics(config, talismane);
                break;
            case modifyCorpus: {
                if (corpusRulesPath == null) {
                    throw new TalismaneException("corpusRules is required for modifyCorpus command");
                }
                List<String> corpusRules = readCorpusRules(corpusRulesPath);
                CorpusModifier corpusModifier = new CorpusModifier(config.getParseConfigurationProcessor(),
                        corpusRules);
                talismane.setParseConfigurationProcessor(corpusModifier);
                break;
            }
            case projectify: {
                CorpusProjectifier projectifier = new CorpusProjectifier(config.getParseConfigurationProcessor());
                talismane.setParseConfigurationProcessor(projectifier);
                break;
            }
            default:
                throw new RuntimeException("Unknown command: " + command);
            }
        } catch (IOException e) {
            LogUtils.logError(LOG, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds the template writer for the toStandoffSentences command from the
     * "standoffSentences.ftl" resource located relative to {@link StandoffWriter}.
     */
    private FreemarkerTemplateWriter openStandoffSentencesWriter() throws IOException {
        InputStream inputStream = StandoffWriter.class.getResourceAsStream("standoffSentences.ftl");
        if (inputStream == null) {
            // getResourceAsStream returns null for a missing resource; fail with a message
            // naming the resource rather than a NullPointerException from the reader.
            throw new TalismaneException("Resource not found: standoffSentences.ftl");
        }
        Reader templateReader = new BufferedReader(new InputStreamReader(inputStream));
        return new FreemarkerTemplateWriter(templateReader);
    }

    /**
     * Sets up a {@link CorpusStatistics} processor writing to "&lt;baseName&gt;_stats.csv" and
     * "&lt;baseName&gt;_stats.zip" in the output directory, and installs it on talismane.
     */
    private void prepareCorpusStatistics(TalismaneConfig config, Talismane talismane) throws IOException {
        CorpusStatistics stats = new CorpusStatistics();

        if (referenceStatsPath != null) {
            CorpusStatistics referenceStats = CorpusStatistics.loadFromFile(new File(referenceStatsPath));
            stats.setReferenceWords(referenceStats.getWords());
            stats.setReferenceLowercaseWords(referenceStats.getLowerCaseWords());
        }

        // Start from a fresh, empty CSV file.
        File csvFile = new File(config.getOutDir(), config.getBaseName() + "_stats.csv");
        csvFile.delete();
        csvFile.createNewFile();
        // The writer is handed over to stats and therefore not closed here.
        Writer csvFileWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(csvFile, false), "UTF8"));
        stats.setWriter(csvFileWriter);

        File serializationFile = new File(config.getOutDir(), config.getBaseName() + "_stats.zip");
        serializationFile.delete();
        stats.setSerializationFile(serializationFile);

        // NOTE(review): fails with ClassCastException for any other corpus reader type.
        ParserRegexBasedCorpusReader corpusReader = (ParserRegexBasedCorpusReader) config
                .getParserCorpusReader();
        corpusReader.setPredictTransitions(false);

        talismane.setParseConfigurationProcessor(stats);
    }

    /**
     * Reads the file at the given path as UTF-8 and returns its lines, one rule per line.
     * The file is always closed, and a read error is rethrown instead of silently
     * producing a truncated list.
     */
    private static List<String> readCorpusRules(String path) throws IOException {
        List<String> corpusRules = new ArrayList<String>();
        Scanner scanner = new Scanner(
                new BufferedReader(new InputStreamReader(new FileInputStream(new File(path)), "UTF-8")));
        try {
            while (scanner.hasNextLine()) {
                corpusRules.add(scanner.nextLine());
            }
            // Scanner swallows IOExceptions from its source; surface the last one, if any.
            IOException readError = scanner.ioException();
            if (readError != null) {
                throw readError;
            }
        } finally {
            // Closing the scanner also closes the underlying file stream.
            scanner.close();
        }
        return corpusRules;
    }

    /**
     * @return the extended command selected by {@link #pluckParameters(Map)}, or null if none
     */
    public ExtendedCommand getCommand() {
        return command;
    }

}