eu.edisonproject.classification.tfidf.mapreduce.TFIDFDriverImpl.java Source code

Introduction

Here is the source code for eu.edisonproject.classification.tfidf.mapreduce.TFIDFDriverImpl.java. The class drives the classification pipeline of the EDISON project: it prepares the input text (converting it to Avro for small term lists), chains four MapReduce jobs (word frequency, per-document word counts, TF-IDF, competence distances), and copies the final results from HDFS to a local output directory.

Source

/*
 * Copyright 2016 Michele Sparamonti & Spiros Koulouzis.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.edisonproject.classification.tfidf.mapreduce;

import eu.edisonproject.classification.prepare.controller.DataPrepare;
import eu.edisonproject.classification.prepare.controller.IDataPrepare;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.UUID;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;

/**
 *
 * @author Michele Sparamonti (michele.sparamonti@eng.it)
 */
public class TFIDFDriverImpl {

    //where to read the frequent itemset
    public static String INPUT_ITEMSET;
    public static String COMPETENCES_PATH;

    //    private String finalOutputPath;
    public static String NUM_OF_LINES;
    public static String STOPWORDS_PATH = ".." + File.separator + "etc" + File.separator + "stopwords.csv";
    public String OUT;

    /**
     * Runs the complete TF-IDF pipeline on a directory of plain-text documents
     * and copies the resulting competence distances into the local {@code OUT} directory.
     *
     * @param inputPath directory containing the .txt documents to process
     */
    public void executeTFIDF(String inputPath) {
        try {
            File items = new File(INPUT_ITEMSET);
            if (!items.exists()) {
                throw new IOException(items.getAbsoluteFile() + " not found");
            }

            String OUTPUT_PATH1 = System.currentTimeMillis() + "_" + UUID.randomUUID()
                    + "-TFIDFDriverImpl-1-word-freq";

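            // Step 1: raw term frequencies. Small term lists are first converted to Avro and
            // processed by WordFrequencyInDocDriver; larger lists go through TermWordFrequency.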
            if (items.length() < 200000000) {
                String AVRO_FILE = System.currentTimeMillis() + "_" + UUID.randomUUID() + "-TFIDFDriverImpl-avro";
                Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "Starting text2Avro");
                text2Avro(inputPath, AVRO_FILE);

                Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO,
                        "Starting WordFrequencyInDocDriver: {0},{1},{2},{3}",
                        new Object[] { AVRO_FILE, OUTPUT_PATH1, INPUT_ITEMSET, STOPWORDS_PATH });
                String[] args1 = { AVRO_FILE, OUTPUT_PATH1, INPUT_ITEMSET, STOPWORDS_PATH };
                ToolRunner.run(new WordFrequencyInDocDriver(), args1);
            } else {
                Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "Starting TermWordFrequency");
                String[] args1 = { INPUT_ITEMSET, OUTPUT_PATH1, inputPath, STOPWORDS_PATH, NUM_OF_LINES };
                ToolRunner.run(new TermWordFrequency(), args1);
            }
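            // Step 2: WordCountsForDocsDriver computes the word counts per document from the step-1 output.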
            String OUTPUT_PATH2 = System.currentTimeMillis() + "_" + UUID.randomUUID()
                    + "-TFIDFDriverImpl-2-word-counts";
            String[] args2 = { OUTPUT_PATH1, OUTPUT_PATH2 };
            ToolRunner.run(new WordCountsForDocsDriver(), args2);

            File docs = new File(inputPath);
            File[] files = docs.listFiles(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    return name.toLowerCase().endsWith(".txt");
                }
            });
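            // The number of .txt documents in the input directory is the corpus size that
            // WordsInCorpusTFIDFDriver (step 3) uses for the IDF term.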
            Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "docs:{0}", docs.getAbsolutePath());
            if (files == null) {
                throw new IOException(docs.getAbsolutePath() + " is not a readable directory");
            }
            int numberOfDocuments = files.length;
            String OUTPUT_PATH3 = System.currentTimeMillis() + "_" + UUID.randomUUID()
                    + "-TFIDFDriverImpl-3-tf-idf";
            String[] args3 = { OUTPUT_PATH2, OUTPUT_PATH3, String.valueOf(numberOfDocuments) };
            ToolRunner.run(new WordsInCorpusTFIDFDriver(), args3);

            StringBuilder fileNames = new StringBuilder();
            String prefix = "";
            for (File name : files) {
                if (name.isFile() && FilenameUtils.getExtension(name.getName()).endsWith("txt")) {
                    fileNames.append(prefix);
                    prefix = ",";
                    fileNames.append(FilenameUtils.removeExtension(name.getName()).replaceAll("_", ""));
                }
            }
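            // Step 4: CompetencesDistanceDriver computes the distances between each document's TF-IDF
            // vector and the competences under COMPETENCES_PATH, using the document names built above.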
            String OUTPUT_PATH4 = System.currentTimeMillis() + "_" + UUID.randomUUID()
                    + "-TFIDFDriverImpl-4-distances";
            String[] args4 = { OUTPUT_PATH3, OUTPUT_PATH4, COMPETENCES_PATH, fileNames.toString() };
            Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "args4:{0}", Arrays.toString(args4));
            ToolRunner.run(new CompetencesDistanceDriver(), args4);

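            // Copy the distance results from HDFS to the local OUT directory, then remove them from HDFS.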
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            Path hdfsRes = new Path(OUTPUT_PATH4);
            FileStatus[] results = fs.listStatus(hdfsRes);
            for (FileStatus s : results) {
                Path dest = new Path(OUT + "/" + s.getPath().getName());
                Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "Copy: {0} to: {1}",
                        new Object[] { s.getPath(), dest });
                fs.copyToLocalFile(s.getPath(), dest);
            }
            fs.delete(hdfsRes, true);

        } catch (Exception ex) {
            Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.SEVERE, null, ex);
        }

    }

    //    public void readDistancesOutputAndPrintCSV() {
    //        ReaderFile rf = new ReaderFile(OUTPUT_PATH4 + File.separator + "part-r-00000");
    //        String text = rf.readFileWithN();
    //        String[] textLine = text.split("\n");
    //        WriterFile fileWriter = new WriterFile(finalOutputPath);
    //        String textToPrint = "";
    //        for (String line : textLine) {
    //            String[] keyValue = line.split("\t");
    //            String[] field = keyValue[0].split("@");
    //            String[] distances = keyValue[1].split(";");
    //            textToPrint += field[1] + ";" + field[0] + ";" + field[2] + ";";
    //            for (String d : distances) {
    //                textToPrint += d + ";";
    //            }
    //            textToPrint += "\n";
    //        }
    //        fileWriter.writeFile(textToPrint);
    //    }
    public static void text2Avro(String inputPath, String outputPath) {
        Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.INFO, "Start");
        try {
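            // Delete any previous output file/directory and recreate it, then let DataPrepare
            // convert the raw text in inputPath into Avro records.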
            File out = new File(outputPath);
            out.getAbsoluteFile().delete();
            out.getAbsoluteFile().mkdirs();

            IDataPrepare dp = new DataPrepare(inputPath, outputPath, STOPWORDS_PATH);
            dp.execute();
        } catch (Exception ex) {
            Logger.getLogger(TFIDFDriverImpl.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
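
For reference, a minimal sketch of how this driver might be invoked, assuming the referenced MapReduce driver classes and a Hadoop configuration are available on the classpath; the example class name and all paths below are placeholders, not values taken from the project:

import eu.edisonproject.classification.tfidf.mapreduce.TFIDFDriverImpl;

public class TFIDFDriverExample { // hypothetical helper class for illustration

    public static void main(String[] args) {
        // Placeholder values for the static configuration fields.
        TFIDFDriverImpl.INPUT_ITEMSET = "/data/itemset.csv";      // frequent term list
        TFIDFDriverImpl.COMPETENCES_PATH = "/data/competences";   // competence definitions
        TFIDFDriverImpl.NUM_OF_LINES = "2000";                    // passed to TermWordFrequency for large term lists
        TFIDFDriverImpl.STOPWORDS_PATH = "/data/stopwords.csv";   // overrides the default ../etc/stopwords.csv

        TFIDFDriverImpl driver = new TFIDFDriverImpl();
        driver.OUT = "/data/results";                             // local directory that receives the HDFS output
        driver.executeTFIDF("/data/documents");                   // directory of .txt documents to process
    }
}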