eu.edisonproject.classification.main.BatchMain.java Source code

Introduction

Here is the source code for eu.edisonproject.classification.main.BatchMain.java.
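
BatchMain is the command-line entry point of the E-CO-2 classification module. Depending on the -op flag it performs one of three operations: 'a' reads .txt files, strips stop words, lemmatizes the text, and writes the cleaned files to an output directory; 'c' configures and runs the TF-IDF classification through TFIDFDriverImpl; 'p' ranks every row of one competence CSV against the first row of another by cosine similarity and writes result.csv and result.json.

A hypothetical invocation (the jar name and paths are placeholders; the option names come from the code below):

java -cp e-co2.jar eu.edisonproject.classification.main.BatchMain -op c -i /data/avro-docs -o /data/out -c /data/competences -p etc/configure.properties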

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/*
* Copyright 2016 Michele Sparamonti, Spiros Koulouzis
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.edisonproject.classification.main;

import eu.edisonproject.utility.file.MyProperties;
import eu.edisonproject.classification.distance.CosineSimilarityMatrix;
import eu.edisonproject.classification.tfidf.mapreduce.TFIDFDriverImpl;
import eu.edisonproject.utility.commons.ValueComparator;
import eu.edisonproject.utility.file.CSVFileReader;
import eu.edisonproject.utility.file.ConfigHelper;
import eu.edisonproject.utility.file.ReaderFile;
import eu.edisonproject.utility.file.WriterFile;
import eu.edisonproject.utility.text.processing.StanfordLemmatizer;
import eu.edisonproject.utility.text.processing.StopWord;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.util.CharArraySet;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 *
 * @author Michele Sparamonti (Michele.Sparamonti@eng.it), S. Koulouzis
 */
public class BatchMain {

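    /**
     * Parses the command line and dispatches to one of the three operations
     * ('a', 'c' or 'p'). A properties file can be supplied with -p; otherwise
     * ../etc/configure.properties is loaded relative to the working directory.
     */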
    public static void main(String[] args) throws Exception {
        try {
            //            args = new String[1];
            //            args[0] = "..";
            //            TestDataFlow.execute(args);
            //            System.exit(0);
            //            TestTFIDF.execute(args);

            Options options = new Options();

            Option operation = new Option("op", "operation", true, "type of operation to perform. "
                    + "To convert txt to avro use 'a'.\n"
                    + "To run classification on avro documents use 'c'.\n"
                    + "To profile two competence vectors use 'p'.");
            operation.setRequired(true);
            options.addOption(operation);

            Option input = new Option("i", "input", true, "input path");
            input.setRequired(false);
            options.addOption(input);

            Option output = new Option("o", "output", true, "output directory");
            output.setRequired(false);
            options.addOption(output);

            Option competencesVector = new Option("c", "competences-vector", true, "competences vectors");
            competencesVector.setRequired(false);
            options.addOption(competencesVector);

            Option v1 = new Option("v1", "vector1", true, "first CSV vector file; its first row is compared against every row of vector2");
            v1.setRequired(false);
            options.addOption(v1);

            Option v2 = new Option("v2", "vector2", true, "second CSV vector file; each row is ranked by cosine similarity to the first row of vector1");
            v2.setRequired(false);
            options.addOption(v2);

            Option propertiesFile = new Option("p", "properties", true, "path to a properties file");
            propertiesFile.setRequired(false);
            options.addOption(propertiesFile);

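            // Note: BasicParser is deprecated in commons-cli 1.3+; DefaultParser is the drop-in replacement.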
            CommandLineParser parser = new BasicParser();
            CommandLine cmd = parser.parse(options, args);
            String propPath = cmd.getOptionValue("properties");
            MyProperties prop;
            if (propPath == null) {
                prop = ConfigHelper
                        .getProperties(".." + File.separator + "etc" + File.separator + "configure.properties");
            } else {
                prop = ConfigHelper.getProperties(propPath);
            }
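
            // A hypothetical configure.properties, using the keys this class reads;
            // the values shown are the defaults the code falls back to:
            //   map.reduce.num.of.lines=200
            //   stop.words.file=../etc/stopwords.csv
            //   itemset.file=../etc/allTerms.csv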

            switch (cmd.getOptionValue("operation")) {
                case "a":
                    text2Avro(cmd.getOptionValue("input"), cmd.getOptionValue("output"), prop);
                    break;
                case "c":
                    calculateTFIDF(cmd.getOptionValue("input"), cmd.getOptionValue("output"),
                            cmd.getOptionValue("competences-vector"), prop);
                    break;
                case "p":
                    // Example: -op p -v2 $HOME/Downloads/msc.csv -v1 $HOME/Downloads/job.csv -p $HOME/workspace/E-CO-2/etc/classification.properties
                    profile(cmd.getOptionValue("v1"), cmd.getOptionValue("v2"), cmd.getOptionValue("output"));
                    break;
                default:
                    Logger.getLogger(BatchMain.class.getName()).log(Level.WARNING,
                            "Unknown operation: {0}", cmd.getOptionValue("operation"));
                    break;
            }

        } catch (IllegalArgumentException | ParseException | IOException ex) {
            Logger.getLogger(BatchMain.class.getName()).log(Level.SEVERE, null, ex);
        }

    }

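    /**
     * Configures the static inputs of TFIDFDriverImpl (number of lines per
     * split, stop-word list, item-set file, competence vectors) and runs the
     * TF-IDF job on the documents found under the input path.
     */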
    public static void calculateTFIDF(String in, String out, String competencesVectorPath, Properties prop)
            throws IOException {

        TFIDFDriverImpl tfidfDriver = new TFIDFDriverImpl();

        if (TFIDFDriverImpl.NUM_OF_LINES == null) {
            TFIDFDriverImpl.NUM_OF_LINES = prop.getProperty("map.reduce.num.of.lines", "200");
        }

        // System properties override the properties file, which in turn supplies the defaults.
        TFIDFDriverImpl.STOPWORDS_PATH = System.getProperty("stop.words.file");
        if (TFIDFDriverImpl.STOPWORDS_PATH == null) {
            TFIDFDriverImpl.STOPWORDS_PATH = prop.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        TFIDFDriverImpl.INPUT_ITEMSET = System.getProperty("itemset.file");
        if (TFIDFDriverImpl.INPUT_ITEMSET == null) {
            TFIDFDriverImpl.INPUT_ITEMSET = prop.getProperty("itemset.file",
                    ".." + File.separator + "etc" + File.separator + "allTerms.csv");
        }

        TFIDFDriverImpl.COMPETENCES_PATH = competencesVectorPath;
        tfidfDriver.OUT = out;
        tfidfDriver.executeTFIDF(in);
    }

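    /**
     * Reads every .txt file in inputPath, removes stop words, lower-cases and
     * lemmatizes the contents, and writes the cleaned text to outputPath under
     * the same file name. Note that, despite the name, this method writes
     * plain text; no Avro serialization is performed here.
     */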
    private static void text2Avro(String inputPath, String outputPath, Properties prop) {
        String stopWordsPath = System.getProperty("stop.words.file");

        if (stopWordsPath == null) {
            stopWordsPath = prop.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
        StopWord cleanStopWord = new StopWord(stopWordArraySet);
        StanfordLemmatizer cleanLemmatisation = new StanfordLemmatizer();
        File filesInDir = new File(inputPath);
        // listFiles() returns null when inputPath does not point to a readable directory
        File[] files = filesInDir.listFiles();
        if (files == null) {
            throw new IllegalArgumentException(inputPath + " is not a readable directory");
        }
        for (File f : files) {
            // match the whole extension; endsWith("txt") would also accept e.g. "xtxt"
            if (f.isFile() && "txt".equalsIgnoreCase(FilenameUtils.getExtension(f.getName()))) {
                ReaderFile rf = new ReaderFile(f.getAbsolutePath());
                String contents = rf.readFile();
                cleanStopWord.setDescription(contents);
                String cleanCont = cleanStopWord.execute().toLowerCase();
                cleanLemmatisation.setDescription(cleanCont);
                cleanCont = cleanLemmatisation.execute();
                WriterFile wf = new WriterFile(outputPath + File.separator + f.getName());
                wf.writeFile(cleanCont);
            }
        }
        //        IDataPrepare dp = new DataPrepare(inputPath, outputPath, stopWordsPath);
        //        dp.execute();

    }

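    /**
     * Compares the first row of csvFile1 against every row of csvFile2 using
     * cosine similarity, then writes the csvFile2 rows ranked by similarity to
     * result.csv and result.json in the output directory. Assumes both files
     * share the same column header and that csvFile1 has at least one data row.
     */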
    private static void profile(String csvFile1, String csvFile2, String output) throws Exception {
        Map<String, Collection<Double>> cv = CSVFileReader.csvCompetanceToMap(csvFile1, ",", Boolean.TRUE);
        Map<String, Collection<Double>> jobVec = CSVFileReader.csvCompetanceToMap(csvFile2, ",", Boolean.TRUE);
        CosineSimilarityMatrix cosineFunction = new CosineSimilarityMatrix();

        String k1 = cv.keySet().iterator().next();
        Map<String, Double> winners = new HashMap<>();
        for (String k : jobVec.keySet()) {
            Collection<Double> j = jobVec.get(k);
            double d = cosineFunction.computeDistance(cv.get(k1), j);
            //      if (!Double.isNaN(d)) {
            winners.put(k, d);
            //      }
        }
        StringBuilder lines = new StringBuilder();
        ReaderFile rf = new ReaderFile(csvFile1);
        String fileHeader = rf.readFileWithN().split("\n")[0];
        String[] header = fileHeader.split(",");
        lines.append("rank").append(",");
        lines.append(fileHeader);
        lines.append("\n");

        int rank = 0;

        JSONObject cvJson = new JSONObject();
        Collection<Double> vector = cv.get(k1);
        String val = vector.toString().replaceAll("\\[", "").replaceAll("\\]", "");
        lines.append(rank).append(",").append(k1).append(",").append(val).append("\n");
        Iterator<Double> iter = vector.iterator();
        int i = 0;
        cvJson.put(header[i++], k1);
        while (iter.hasNext()) {
            String key = header[i++];
            Double value = iter.next();
            cvJson.put(key, value);
        }
        cvJson.put("rank", rank);
        JSONArray profileJson = new JSONArray();
        profileJson.add(cvJson);
        ValueComparator bvc = new ValueComparator(winners);
        Map<String, Double> sortedMap = new TreeMap<>(bvc);
        sortedMap.putAll(winners);

        for (String k : sortedMap.keySet()) {
            JSONObject jobJson = new JSONObject();
            rank++;
            vector = jobVec.get(k);
            val = vector.toString().replaceAll("\\[", "").replaceAll("\\]", "");
            lines.append(rank).append(",").append(k).append(",").append(val).append("\n");

            i = 0;
            jobJson.put(header[i++], k);
            iter = vector.iterator();
            while (iter.hasNext()) {
                String key = header[i++];
                Double value = iter.next();
                jobJson.put(key, value);
            }
            jobJson.put("rank", rank);
            profileJson.add(jobJson);
        }
        WriterFile wf = new WriterFile(output + File.separator + "result.csv");
        wf.writeFile(lines.toString());

        wf = new WriterFile(output + File.separator + "result.json");
        wf.writeFile(profileJson.toJSONString());
    }
}
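
The ranking in profile rests on cosine similarity between two numeric vectors. For reference, here is a minimal sketch of that computation; the actual CosineSimilarityMatrix.computeDistance may differ in detail, for instance in how it treats vectors of unequal length:

import java.util.Arrays;
import java.util.Collection;

public class CosineSketch {

    // Cosine similarity: dot(a, b) / (|a| * |b|). Returns NaN when either
    // vector has zero magnitude, which is why profile() once filtered NaN.
    static double cosine(Collection<Double> a, Collection<Double> b) {
        Double[] x = a.toArray(new Double[0]);
        Double[] y = b.toArray(new Double[0]);
        double dot = 0, nx = 0, ny = 0;
        for (int i = 0; i < Math.min(x.length, y.length); i++) {
            dot += x[i] * y[i];
            nx += x[i] * x[i];
            ny += y[i] * y[i];
        }
        return dot / (Math.sqrt(nx) * Math.sqrt(ny));
    }

    public static void main(String[] args) {
        System.out.println(cosine(Arrays.asList(1.0, 0.0, 1.0),
                                  Arrays.asList(1.0, 1.0, 0.0))); // prints 0.5
    }
}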