Java tutorial

Main.java is the command-line entry point of the EDISON training tools (package eu.edisonproject.training.execute). It parses its arguments with Apache Commons CLI and, depending on the -op value, runs term extraction, word sense disambiguation, TF-IDF vector calculation, or the apriori step.
/*
 * Copyright 2016 S. Koulouzis.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.edisonproject.training.execute;

import com.google.common.io.Files;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import eu.edisonproject.training.context.corpus.DataPrepare;
import eu.edisonproject.training.term.extraction.TermExtractor;
import eu.edisonproject.training.tfidf.mapreduce.ITFIDFDriver;
import eu.edisonproject.training.tfidf.mapreduce.TFIDFDriverImpl;
import eu.edisonproject.training.tfidf.mapreduce.TFIDFTermsDriver;
import eu.edisonproject.training.wsd.DisambiguatorImpl;
import eu.edisonproject.training.wsd.MetaDisambiguator;
import eu.edisonproject.utility.commons.Term;
import eu.edisonproject.utility.commons.TermAvroSerializer;
import eu.edisonproject.utility.file.ConfigHelper;
import eu.edisonproject.utility.file.MyProperties;
import eu.edisonproject.utility.text.processing.StanfordLemmatizer;
import eu.edisonproject.utility.text.processing.StopWord;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Example usage for training: -op t -i
 * $HOME/Downloads/Databases/dictionary.csv \ -o
 * $HOME/Downloads/D2.1_Table5_Skills_and_knowledge_Big_Data_platforms_and_tools/Databases/Databases.avro
 *
 * for term extraction:
 *
 *
 * @author S. Koulouzis
 */
public class Main {

    private static MyProperties prop;

    public static void main(String args[]) {
        Options options = new Options();

        // -op/--operation, -i/--input and -o/--output are required;
        // -p/--properties and -t/--terms are optional.
        Option operation = new Option("op", "operation", true,
                "type of operation to perform. "
                + "For term extraction use 'x'.\n"
                + "Example: -op x -i E-COCO/documentation/sampleTextFiles/databases.txt "
                + "-o E-COCO/documentation/sampleTextFiles/databaseTerms.csv\n"
                + "For word sense disambiguation use 'w'.\n"
                + "Example: -op w -i E-COCO/documentation/sampleTextFiles/databaseTerms.csv "
                + "-o E-COCO/documentation/sampleTextFiles/database.avro\n"
                + "For tf-idf vector extraction use 't'.\n"
                + "For running the apriori algorithm use 'a'");
        operation.setRequired(true);
        options.addOption(operation);

        Option input = new Option("i", "input", true, "input file path");
        input.setRequired(true);
        options.addOption(input);

        Option output = new Option("o", "output", true, "output file");
        output.setRequired(true);
        options.addOption(output);

        Option popertiesFile = new Option("p", "properties", true, "path for a properties file");
        popertiesFile.setRequired(false);
        options.addOption(popertiesFile);

        Option termsFile = new Option("t", "terms", true, "terms file");
        termsFile.setRequired(false);
        options.addOption(termsFile);

        String helpmasg = "Usage: \n";
        for (Object obj : options.getOptions()) {
            Option op = (Option) obj;
            helpmasg += op.getOpt() + ", " + op.getLongOpt() + "\t Required: " + op.isRequired()
                    + "\t\t" + op.getDescription() + "\n";
        }

        try {
            CommandLineParser parser = new BasicParser();
            CommandLine cmd = parser.parse(options, args);

            String propPath = cmd.getOptionValue("properties");
            if (propPath == null) {
                prop = ConfigHelper.getProperties(".." + File.separator + "etc"
                        + File.separator + "configure.properties");
            } else {
                prop = ConfigHelper.getProperties(propPath);
            }
            // ${user.home}
            switch (cmd.getOptionValue("operation")) {
                case "x":
                    termExtraction(cmd.getOptionValue("input"), cmd.getOptionValue("output"));
                    break;
                case "w":
                    wsd(cmd.getOptionValue("input"), cmd.getOptionValue("output"));
                    break;
                case "t":
                    calculateTFIDF(cmd.getOptionValue("input"), cmd.getOptionValue("output"));
                    break;
//                case "tt":
//                    calculateTermTFIDF(cmd.getOptionValue("input"), cmd.getOptionValue("terms"), cmd.getOptionValue("output"));
//                    break;
                case "a":
                    apriori(cmd.getOptionValue("input"), cmd.getOptionValue("output"));
                    break;
                default:
                    System.out.println(helpmasg);
            }
        } catch (Exception ex) {
            Logger.getLogger(Main.class.getName()).log(Level.SEVERE, helpmasg, ex);
        }
    }

    private static void termExtraction(String docs, String out) throws ClassNotFoundException,
            InstantiationException, IllegalAccessException, IOException, InterruptedException {
        if (!new File(docs).exists()) {
            throw new IOException(new File(docs).getAbsolutePath() + " does not exist");
        }
        String[] extractors = prop.getProperty("term.extractors",
                "eu.edisonproject.training.term.extraction.LuceneExtractor,"
                + "eu.edisonproject.training.term.extraction.JtopiaExtractor,"
                + "eu.edisonproject.training.term.extraction.AprioriExtraction").split(",");
//        String[] extractors = prop.getProperty("term.extractors",
//                "eu.edisonproject.training.term.extraction.JtopiaExtractor,"
//                + "eu.edisonproject.training.term.extraction.AprioriExtraction").split(",");
//        String[] extractors = "eu.edisonproject.training.term.extraction.JtopiaExtractor".split(",");

        Map<String, Double> termDictionaray = new HashMap<>();
        for (String className : extractors) {
            Class c = Class.forName(className);
            Object obj = c.newInstance();
            TermExtractor termExtractor = (TermExtractor) obj;
            termExtractor.configure(prop);
            termDictionaray.putAll(termExtractor.termXtraction(docs));
        }
        writeDictionary2File(termDictionaray, out);
        calculateTermTFIDF(docs, out, out);
    }

    /**
     * Writes the extracted terms as one "term,score" line per entry: keys are
     * lower-cased, trimmed, spaces are replaced with underscores and a trailing
     * underscore is stripped.
     */
    public static void writeDictionary2File(Map<String, Double> keywordsDictionaray,
            String outkeywordsDictionarayFile) throws FileNotFoundException {
//        ValueComparator bvc = new ValueComparator(keywordsDictionaray);
//        Map<String, Double> sorted_map = new TreeMap(bvc);
//        sorted_map.putAll(keywordsDictionaray);
        try (PrintWriter out = new PrintWriter(outkeywordsDictionarayFile)) {
            for (String key : keywordsDictionaray.keySet()) {
                Double value = keywordsDictionaray.get(key);
                key = key.toLowerCase().trim().replaceAll(" ", "_");
                if (key.endsWith("_")) {
                    key = key.substring(0, key.lastIndexOf("_"));
                }
                out.print(key + "," + value + "\n");
            }
        }
    }

    /**
     * Disambiguates the terms in the input file and stores the result as Avro.
     */
    private static void wsd(String in, String out) throws IOException, FileNotFoundException,
            org.json.simple.parser.ParseException {
        DisambiguatorImpl d = new MetaDisambiguator();
        d.configure(prop);
        List<Term> terms = d.disambiguateTerms(in);
        saveTerms2Avro(terms, out);
    }

    /**
     * Fills empty Avro fields with placeholders, removes stop words from and
     * lemmatizes the glosses, and serializes each term to the given Avro file.
     */
    private static void saveTerms2Avro(List<Term> terms, String out) {
        TermAvroSerializer ts = new TermAvroSerializer(out, Term.getClassSchema());
        List<CharSequence> empty = new ArrayList<>();
        empty.add("");
//        Stemming stemer = new Stemming();
        String stopWordsPath = System.getProperty("stop.words.file");
        if (stopWordsPath == null) {
            stopWordsPath = prop.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        CharArraySet stopwordsCharArray = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
        StopWord tokenizer = new StopWord(stopwordsCharArray);
        StanfordLemmatizer lematizer = new StanfordLemmatizer();

        for (Term t : terms) {
            List<CharSequence> nuid = t.getNuids();
            if (nuid == null || nuid.isEmpty() || nuid.contains(null)) {
                t.setNuids(empty);
            }
            List<CharSequence> buids = t.getBuids();
            if (buids == null || buids.isEmpty() || buids.contains(null)) {
                t.setBuids(empty);
            }
            List<CharSequence> alt = t.getAltLables();
            if (alt == null || alt.isEmpty() || alt.contains(null)) {
                t.setAltLables(empty);
            }
            List<CharSequence> gl = t.getGlosses();
            if (gl == null || gl.isEmpty() || gl.contains(null)) {
                ArrayList<CharSequence> lem = new ArrayList<>();
                lem.add(t.lemma);
                t.setGlosses(lem);
            } else {
                StringBuilder glosses = new StringBuilder();
                for (CharSequence n : gl) {
                    glosses.append(n).append(" ");
                }
                glosses.append(t.lemma.toString().replaceAll("_", " "));
                if (alt != null && !alt.isEmpty() && !alt.contains(null)) {
                    for (CharSequence c : alt) {
                        glosses.append(c.toString().replaceAll("_", " ")).append(" ");
                    }
                }
                gl = new ArrayList<>();
                tokenizer.setDescription(glosses.toString());
                String cleanText = tokenizer.execute();
                lematizer.setDescription(cleanText);
                String lematizedText = lematizer.execute();
                gl.add(lematizedText);
                t.setGlosses(gl);
            }
            List<CharSequence> cat = t.getCategories();
            if (cat == null || cat.contains(null)) {
                t.setCategories(empty);
            }
            ts.serialize(t);
        }
        ts.close();
    }
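
    /**
     * Stages the Avro input in a fresh temporary working folder, redirects the
     * TF-IDF driver's intermediate paths into it, runs the driver and finally
     * moves the resulting CSV files from CONTEXT_PATH into the output directory.
     */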
    private static void calculateTFIDF(String in, String out) throws IOException {
        File tmpFolder = null;
        try {
            String contextName = FilenameUtils.removeExtension(in.substring(in.lastIndexOf(File.separator) + 1));
            ITFIDFDriver tfidfDriver = new TFIDFDriverImpl(contextName);
            File inFile = new File(in);
            String workingFolder = System.getProperty("working.folder");
            if (workingFolder == null) {
                workingFolder = prop.getProperty("working.folder", System.getProperty("java.io.tmpdir"));
            }
            tmpFolder = new File(workingFolder + File.separator + System.currentTimeMillis());
            tmpFolder.mkdir();
            tmpFolder.deleteOnExit();
            setTFIDFDriverImplPaths(inFile, tmpFolder);
            tfidfDriver.executeTFIDF(tmpFolder.getAbsolutePath());
            tfidfDriver.driveProcessResizeVector();

            File ctxPath = new File(TFIDFDriverImpl.CONTEXT_PATH);
            for (File f : ctxPath.listFiles()) {
                if (FilenameUtils.getExtension(f.getName()).endsWith("csv")) {
                    FileUtils.moveFile(f, new File(out + File.separator + f.getName()));
                }
            }
        } finally {
            if (tmpFolder != null && tmpFolder.exists()) {
                FileUtils.forceDelete(tmpFolder);
            }
        }
    }

    /**
     * Redirects all TFIDFDriverImpl input/output paths into the temporary
     * folder and copies the Avro input files there.
     */
    private static void setTFIDFDriverImplPaths(File inFile, File tmpFolder) throws IOException {
        TFIDFDriverImpl.INPUT_ITEMSET = System.getProperty("itemset.file");
        if (TFIDFDriverImpl.INPUT_ITEMSET == null) {
            TFIDFDriverImpl.INPUT_ITEMSET = prop.getProperty("itemset.file",
                    ".." + File.separator + "etc" + File.separator + "itemset.csv");
        }
        File outPath1 = new File(TFIDFDriverImpl.OUTPUT_PATH1);
        TFIDFDriverImpl.OUTPUT_PATH1 = tmpFolder.getAbsolutePath() + File.separator + outPath1.getName();

        File inPath2 = new File(TFIDFDriverImpl.INPUT_PATH2);
        TFIDFDriverImpl.INPUT_PATH2 = tmpFolder.getAbsolutePath() + File.separator + inPath2.getName();

        File outPath2 = new File(TFIDFDriverImpl.OUTPUT_PATH2);
        TFIDFDriverImpl.OUTPUT_PATH2 = tmpFolder.getAbsolutePath() + File.separator + outPath2.getName();

        File inPath3 = new File(TFIDFDriverImpl.INPUT_PATH3);
        TFIDFDriverImpl.INPUT_PATH3 = tmpFolder.getAbsolutePath() + File.separator + inPath3.getName();

        File outPath3 = new File(TFIDFDriverImpl.OUTPUT_PATH3);
        TFIDFDriverImpl.OUTPUT_PATH3 = tmpFolder.getAbsolutePath() + File.separator + outPath3.getName();

        File inPath4 = new File(TFIDFDriverImpl.INPUT_PATH4);
        TFIDFDriverImpl.INPUT_PATH4 = tmpFolder.getAbsolutePath() + File.separator + inPath4.getName();

        File outPath4 = new File(TFIDFDriverImpl.OUTPUT_PATH4);
        TFIDFDriverImpl.OUTPUT_PATH4 = tmpFolder.getAbsolutePath() + File.separator + outPath4.getName();

        File tiidfCSV = new File(TFIDFDriverImpl.TFIDFCSV_PATH);
        TFIDFDriverImpl.TFIDFCSV_PATH = tmpFolder.getAbsolutePath() + File.separator + tiidfCSV.getName();

        File context = new File(TFIDFDriverImpl.CONTEXT_PATH);
        TFIDFDriverImpl.CONTEXT_PATH = tmpFolder.getAbsolutePath() + File.separator + context.getName();

        if (inFile.isFile() && FilenameUtils.getExtension(inFile.getName()).endsWith("avro")) {
            FileUtils.copyFile(inFile, new File(tmpFolder + File.separator + inFile.getName()));
//            tfidfDriver.executeTFIDF(tmpFolder.getAbsolutePath());
        } else {
            for (File f : inFile.listFiles()) {
                if (FilenameUtils.getExtension(f.getName()).endsWith("avro")) {
                    FileUtils.copyFile(f, new File(tmpFolder + File.separator + f.getName()));
                }
            }
        }
    }

    private static void calculateTermTFIDF(String docPath, String termsFile, String out) throws IOException {
        ITFIDFDriver tfidfDriver = new TFIDFTermsDriver();
        TFIDFTermsDriver.STOPWORDS_PATH = System.getProperty("stop.words.file");
        if (TFIDFTermsDriver.STOPWORDS_PATH == null) {
            TFIDFTermsDriver.STOPWORDS_PATH = prop.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        TFIDFTermsDriver.NUM_OF_LINES = System.getProperty("map.reduce.num.of.lines");
        if (TFIDFTermsDriver.NUM_OF_LINES == null) {
            TFIDFTermsDriver.NUM_OF_LINES = prop.getProperty("map.reduce.num.of.lines", "200");
        }
        setTFIDFTermDriverPaths(docPath, out);
        tfidfDriver.executeTFIDF(termsFile);
    }
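
    /**
     * Runs DataPrepare to build itemset.csv and then filters it with the
     * Stanford POS tagger: only lines whose term is tagged as containing a
     * noun (NN) and no adverb (RB) are kept.
     */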
    private static void apriori(String in, String out) throws IOException {
        String stopWordsPath = System.getProperty("stop.words.file");
        if (stopWordsPath == null) {
            stopWordsPath = prop.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        DataPrepare dataPrepare = new DataPrepare(in, out, stopWordsPath);
        dataPrepare.execute();

        String taggerPath = System.getProperty("model.path");
        if (taggerPath == null) {
            taggerPath = prop.getProperty("model.path", ".." + File.separator + "etc" + File.separator + "model");
        }
        taggerPath += File.separator + "stanford" + File.separator + "english-left3words-distsim.tagger";

        File fin = new File(out + File.separator + "itemset.csv");
        File fout = new File(out + File.separator + "tmp.csv");
        MaxentTagger tagger = new MaxentTagger(taggerPath);
        try (PrintWriter pw = new PrintWriter(fout)) {
            try (BufferedReader br = new BufferedReader(new FileReader(fin))) {
                for (String text; (text = br.readLine()) != null;) {
                    String term = text.split("/")[0];
                    String tagged = tagger.tagString(term);
                    boolean add = true;
                    if (!tagged.contains("NN") || tagged.contains("RB")) {
                        add = false;
                    }
                    if (add) {
                        pw.print(text + "\n");
                    }
                }
            }
        }
        Files.move(fout, fin);
    }

    private static void setTFIDFTermDriverPaths(String textDocsPath, String out) throws IOException {
        TFIDFTermsDriver.TEXT_FILES_DIR_PATH = textDocsPath;
        TFIDFTermsDriver.OUT = out;
    }
}
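
The two sketches below are not part of Main.java; they only illustrate how the class can be driven, assuming the project jar, its dependencies and the default ../etc/configure.properties are in place. The class names MainUsageExample and DictionaryWriteExample and the output file terms.csv are illustrative placeholders, and the input and output paths in the first sketch are the sample files quoted in the -op help text above.

import eu.edisonproject.training.execute.Main;

public class MainUsageExample {

    public static void main(String[] args) {
        // Term extraction ('x'): reads the sample text file and writes a term,score CSV.
        // Main parses these flags with Commons CLI and dispatches on the -op value.
        Main.main(new String[]{
            "-op", "x",
            "-i", "E-COCO/documentation/sampleTextFiles/databases.txt",
            "-o", "E-COCO/documentation/sampleTextFiles/databaseTerms.csv"
        });
    }
}

Because writeDictionary2File is public, the CSV format it produces can also be checked in isolation; a minimal sketch with a couple of made-up terms and scores:

import eu.edisonproject.training.execute.Main;
import java.util.HashMap;
import java.util.Map;

public class DictionaryWriteExample {

    public static void main(String[] args) throws Exception {
        Map<String, Double> terms = new HashMap<>();
        terms.put("Relational Database", 0.82);
        terms.put("NoSQL", 0.67);
        // Writes one "term,score" line per entry, e.g. "relational_database,0.82":
        // keys are lower-cased, trimmed and spaces are replaced with underscores.
        Main.writeDictionary2File(terms, "terms.csv");
    }
}

The other operations are invoked the same way by swapping the -op value ('w', 't' or 'a') and the input and output paths.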