edu.ehu.galan.lite.Lite.java Source code

Introduction

Here is the source code for edu.ehu.galan.lite.Lite.java.

Source

/*
 * Copyright (C) 2014 Angel Conde Manjon neuw84 at gmail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package edu.ehu.galan.lite;

import edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDFAlgorithm;
import edu.ehu.galan.lite.algorithms.ranked.unsupervised.cvalue.CValueAlgortithm;
import edu.ehu.galan.lite.algorithms.ranked.unsupervised.cvalue.filters.english.AdjPrepNounFilter;
import edu.ehu.galan.lite.algorithms.ranked.unsupervised.kpminer.KPMinerAlgorithm;
import edu.ehu.galan.lite.algorithms.ranked.unsupervised.rake.RakeAlgorithm;
import edu.ehu.galan.lite.algorithms.unranked.supervised.freeLingNerEn.FreeLingNerAlgorithm;
import edu.ehu.galan.lite.algorithms.unranked.supervised.shallowParsingGrammar.cg3.ShallowParsingGrammarAlgortithm;
import edu.ehu.galan.lite.mixer.data.wikipedia.WikipediaData;
import edu.ehu.galan.lite.mixer.disambiguation.cValueWikiminerDisambiguation.CValueWikiDisambiguator;
import edu.ehu.galan.lite.mixer.mapping.wikipedia.wikiminer.WikiMinerMap;
import edu.ehu.galan.lite.mixer.relatedness.CValueWikiMinerRelationship.CValueWikiRelationship;
import edu.ehu.galan.lite.mixer.utils.DuplicateRemoval;
import edu.ehu.galan.lite.model.Corpus;
import edu.ehu.galan.lite.model.Document;
import edu.ehu.galan.lite.parsers.AbstractDocumentReader;
import edu.ehu.galan.lite.parsers.english.PlainTextDocumentReaderIXAEn;
import edu.ehu.galan.lite.parsers.spanish.PlainTextDocumentReaderIXAEs;
import edu.ehu.galan.lite.stemmers.CaseStemmer;
import edu.ehu.galan.lite.utils.AlgorithmRunner;
import edu.ehu.galan.lite.utils.wikiminer.WikiminnerHelper;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 * Class that implements a command line interface for LiTe
 *
 * @author Angel Conde Manjon
 */
public class Lite {

    private static Cache cache;

    /**
     * Main entry point for LiTe from the command line; use the -h option for help
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {

        // create Options object
        Options options = new Options();
        Option corpus = new Option("c", true,
                "the location (directory) of the corpus to process, containing only one document at the moment");
        Option language = new Option("l", true,
                "the language of the corpus (lowercase ISO format, for example 'en'");
        Option algorithms = new Option("a", true, "the algorithms you want to process separated by commas: \n"
                + "see the documentation for see the available algorithms for each language");
        Option help = new Option("h", false, "print this message");
        Option resources = new Option("r", true, "the location of the lite resources folder");
        Option output = new Option("o", true,
                "The directory where the results will be stored, by default the one where the vm has been launched");
        Option listAlgs = new Option("listAlgs", false,
                "Algorithm names with their supported languages (note that CValue is always processed, whether chosen or not):\n"
                        + "===================================================\n"
                        + "tfidf => processes the TFIDF algorithm, scoring the terms of the input document using the Wikipedia corpus for the IDF (en, es)\n"
                        + "cvalue => processes the CValue algorithm for the input document; CValue is processed whether it is chosen or not! (en, es)\n"
                        + "shallow => processes the shallow parsing grammar algorithm (en)\n"
                        + "rake => processes the RAKE algorithm (language agnostic)\n"
                        + "kpminer => processes the KP-Miner algorithm (en)\n"
                        + "chisquare => processes the Chi-Square measure using the NLTK toolkit (language agnostic)\n"
                        + "pmi => processes the Pointwise Mutual Information measure using the NLTK toolkit (language agnostic)\n"
                        + "likehood => processes the Likelihood Ratio using the NLTK toolkit (language agnostic)\n"
                        + "tstudent => processes the Student's t-test using the NLTK toolkit (language agnostic)\n"
                        + "rawfreq => processes the raw frequency algorithm using the NLTK toolkit (language agnostic)\n"
                        + "freelingner => processes the FreeLing NER algorithm via an external call (es, en)\n");
        options.addOption(resources);
        options.addOption(corpus);
        options.addOption(language);
        options.addOption(algorithms);
        options.addOption(listAlgs);
        options.addOption(output);
        // register the help option so that -h is actually recognized
        options.addOption(help);
        // automatically generate the help statement
        HelpFormatter formatter = new HelpFormatter();
        // create the parser
        CommandLineParser parser = new org.apache.commons.cli.GnuParser();
        try {
            // parse the command line arguments
            CommandLine line = parser.parse(options, args);
            if (line.hasOption('c') && line.hasOption('l') && line.hasOption('a') && line.hasOption('r')) {
                if (!line.getOptionValue('l').equals("en") && !line.getOptionValue("l").equals("es")) {
                    System.out.println(
                            "Supported languages \"en\" or \"es\", however you may use the statistical algorithms via the API");
                } else {
                    String outputDir = "";
                    if (line.hasOption('o')) {
                        outputDir = line.getOptionValue('o');
                    }
                    String lang = line.getOptionValue('l');
                    Corpus cor = new Corpus(line.getOptionValue('l'));
                    cor.loadCorpus(line.getOptionValue('c'), Document.SourceType.wikipedia);
                    String res = line.getOptionValue('r');
                    List<String> algs = Arrays.asList(line.getOptionValue('a').split(","));
                    System.out.println("Processing.... (it may take a while...)");
                    runner(lang, res + File.separator, algs, cor, outputDir);
                }
            } else if (line.hasOption('c') && line.hasOption('l') && line.hasOption('r')) {
                if (!line.getOptionValue('l').equals("en") && !line.getOptionValue("l").equals("es")) {
                    System.out.println(
                            "Supported languages \"en\" or \"es\", however you may use the statistical algorithms via the API");
                } else {
                    String outputDir = "";
                    if (line.hasOption('o')) {
                        outputDir = line.getOptionValue('o');
                    }
                    System.out.println(
                            "Processing with default algorithms (TFIDF/CValue).... (it may take a while...)");
                    Corpus cor = new Corpus(line.getOptionValue('l'));
                    String res = line.getOptionValue('r');
                    cor.loadCorpus(line.getOptionValue('c'), Document.SourceType.wikipedia);
                    String lang = line.getOptionValue('l');
                    // both supported languages currently use the same default algorithm set
                    List<String> algos = Arrays.asList("cvalue", "tfidf", "rake");
                    runner(lang, res + File.separator, algos, cor, outputDir);
                }
            } else if (line.hasOption("h")) {
                formatter.printHelp("LiTe: a language independent term extractor", options);
            } else if (line.getOptions().length == 0) {
                formatter.printHelp("LiTe: a language independent term extractor", options);
            } else {
                System.err.println("The 'c', 'l' and 'r' arguments are required\n");
                formatter.printHelp("LiTe: a language independent term extractor", options);
            }
        } catch (ParseException exp) {
            // oops, something went wrong
            System.err.println("Parsing failed.  Reason: " + exp.getMessage() + "\n");
            formatter.printHelp("LiTe: a language indepent term extractor", options);

        }

    }

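    /**
     * Configures the requested algorithms for the given language, runs them over
     * every document in the corpus, and post-processes the extracted terms
     * (mapping, disambiguation and relatedness via WikiMiner), saving each
     * document's results as JSON in the output directory.
     */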
    private static void runner(String lang, String resources, List<String> algs, Corpus corpus, String outDir) {
        System.setProperty("net.sf.ehcache.enableShutdownHook", "true");
        if (CacheManager.getCacheManager("ehcacheLitet.xml") == null) {
            CacheManager.create("ehcacheLitet.xml");
        }
        Properties props = new Properties();
        // try-with-resources so the configuration stream is always closed
        try (FileInputStream in = new FileInputStream(resources + "lite/configs/general.conf")) {
            props.load(in);
        } catch (IOException ex) {
            System.err.println("Check the resources dir: " + ex.getMessage());
        }
        AbstractDocumentReader parser = null;
        AlgorithmRunner runner = new AlgorithmRunner();
        CValueAlgortithm cvalue = new CValueAlgortithm();
        switch (lang) {
        case "en":
            cvalue.addNewProcessingFilter(new AdjPrepNounFilter());
            parser = new PlainTextDocumentReaderIXAEn();
            break;
        case "es":
            cvalue.addNewProcessingFilter(
                    new edu.ehu.galan.lite.algorithms.ranked.unsupervised.cvalue.filters.spanish.NounAdjOpenFilter());
            parser = new PlainTextDocumentReaderIXAEs();
            break;
        }
        runner.submitAlgorithm(cvalue);
        //TODO: do this via java reflection
        for (int i = 0; i < algs.size(); i++) {
            // match algorithm names case-insensitively against the keys documented in -listAlgs
            switch (algs.get(i).trim().toLowerCase()) {
            case "tfidf": {
                TFIDFAlgorithm tf = new TFIDFAlgorithm(new CaseStemmer(CaseStemmer.CaseType.lowercase), lang);
                runner.submitAlgorithm(tf);
                break;
            }
            case "FreeLing NER": {
                FreeLingNerAlgorithm alg = null;
                switch (lang) {
                case "en":
                    alg = new FreeLingNerAlgorithm(resources + "lite" + File.separator + "configs" + File.separator
                            + "freeling" + File.separator + "enPOSMW.cfg");
                    break;
                case "es":
                    alg = new FreeLingNerAlgorithm(resources + "lite" + File.separator + "configs" + File.separator
                            + "freeling" + File.separator + "esPOSMW.cfg");
                    break;
                }
                runner.submitAlgorithm(alg);
                break;
            }
            case "KP-Miner": {
                if (lang.equals("en")) {
                    KPMinerAlgorithm kp = new KPMinerAlgorithm();
                    runner.submitAlgorithm(kp);
                }
                break;
            }
            case "Shallow Parsing Grammar": {
                if (lang.equals("en")) {
                    ShallowParsingGrammarAlgortithm a = new ShallowParsingGrammarAlgortithm(resources + "lite"
                            + File.separator + "grammars" + File.separator + "Cg2EnGrammar.grammar",
                            props.getProperty("tmpDir") + File.separator + "cg3");
                    runner.submitAlgorithm(a);
                }
                break;
            }
            case "RAKE": {
                RakeAlgorithm ex = new RakeAlgorithm();
                switch (lang) {
                case "en":
                    ex.loadStopWordsList(resources + "lite/stopWordLists/RakeStopLists/SmartStopListEn");
                    break;
                case "es":
                    ex.loadStopWordsList(resources + "lite/stopWordLists/RakeStopLists/SpanishCustomEs");
                    break;
                }
                ex.loadPunctStopWord(resources + "lite/stopWordLists/RakeStopLists/RakePunctDefaultStopList");
                runner.submitAlgorithm(ex);
                break;
            }
            }
        }
        //load stop list
        List<String> standardStop = null;
        try {
            standardStop = Files.readAllLines(Paths.get(resources + "lite/stopWordLists/standardStopList"),
                    StandardCharsets.UTF_8);

        } catch (IOException e1x) {
            System.err.println("Check your resources dir: " + e1x.getMessage());
        }
        WikiminnerHelper helper = WikiminnerHelper.getInstance(resources);
        helper.setLanguage(lang);
        //we may operate in local mode (using WikiMiner as a local API instead of interacting via its REST API)
        helper.setLocalMode(props.getProperty("localMode").equals("true"),
                "/home/angel/nfs/wikiminer/configs/wikipedia"); // NOTE: hard-coded WikiMiner configuration path
        WikiMinerMap wikimapping = new WikiMinerMap(resources, helper);
        CValueWikiDisambiguator disambiguator = new CValueWikiDisambiguator(resources, helper);
        CValueWikiRelationship relate = new CValueWikiRelationship(resources, helper);
        WikipediaData data = new WikipediaData(resources, helper);
        // the REST connection is only needed when not running in local mode
        if (!(props.getProperty("localMode")).equals("true")) {
            helper.openConnection();
        }
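        // process each document in turn: parse it, run the selected algorithms,
        // filter and merge the candidate terms, then map, disambiguate and
        // relate them through WikiMiner before saving the results as JSON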
        while (!corpus.getDocQueue().isEmpty()) {
            Document doc = corpus.getDocQueue().poll();
            doc.setSource(Document.SourceType.wikipedia);
            parser.readSource(doc.getPath());
            doc.setSentenceList(parser.getSentenceList());
            doc.setTokenList(parser.getTokenizedSentenceList());
            System.out.println(doc.getName());
            runner.runAlgorihms(doc, resources);
            doc.applyGlobalStopWordList(standardStop);
            doc.mapThreshold(1.9f, new String[] { "CValue" });
            doc.mapThreshold(0.00034554f, new String[] { "TFIDF" });
            doc.removeAndMixTerms();
            //map document
            wikimapping.mapCorpus(doc);
            disambiguator.disambiguateTopics(doc);
            //remove topics that were not disambiguated correctly
            DuplicateRemoval.disambiguationRemoval(doc);
            DuplicateRemoval.topicDuplicateRemoval(doc);
            //obtain the wiki links, labels, etc.
            data.processDocument(doc);
            //measure domain relatedness
            relate.relate(doc);
            //save the results
            Document.saveJsonToDir(outDir, doc);
        }
        if (props.getProperty("localMode").equals("true")) {
            helper.closeWikipedia();
        } else {
            helper.closeConnection();
        }
        CacheManager.getInstance().shutdown();
        System.exit(0);
    }
}
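
Example usage

A minimal sketch of driving the CLI programmatically. The corpus, resources, and output paths below are illustrative assumptions, as is the wrapper class name LiteExample; from a shell the equivalent would be running edu.ehu.galan.lite.Lite with the same flags on the classpath.

public class LiteExample {
    public static void main(String[] args) {
        // Run LiTe on an English corpus with the TFIDF and RAKE algorithms
        // (CValue is always run as well), writing the JSON results to "out".
        edu.ehu.galan.lite.Lite.main(new String[] {
                "-c", "corpus",     // directory with the document(s) to process
                "-l", "en",         // corpus language: "en" or "es"
                "-a", "tfidf,rake", // comma-separated algorithm names (see -listAlgs)
                "-r", "resources",  // the LiTe resources folder
                "-o", "out"         // where the results will be stored
        });
    }
}

Note that the runner ends with System.exit(0), so the JVM terminates once processing finishes; the call above should therefore be the last thing the wrapping program does.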