HoopRemoteTask.java Source code

Introduction

Here is the source code for HoopRemoteTask.java
Source

/** 
 * Author: Martin van Velsen <vvelsen@cs.cmu.edu>
 * 
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as 
 *  published by the Free Software Foundation, either version 3 of the 
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * By default uses the following:
 * 
 *    stopwords: a,an,and,are,as,at,for,i,if,in,is,it,of,on,so,that,the,to
 *  garbage:   -$%^&*()!@# (when found as a complete term)
 *    casefolding: yes (can be turned off but is not recommended)
 *    stemming: yes
 * 
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import edu.cmu.cs.in.search.HoopDataSet;
import edu.cmu.cs.in.base.HoopRoot;
import edu.cmu.cs.in.base.HoopStringTools;
//import edu.cmu.cs.in.base.io.HoopFileManager;
import edu.cmu.cs.in.base.HoopLink;
import edu.cmu.cs.in.stats.HoopPerformanceMeasure;
import edu.cmu.cs.in.stats.HoopStatistics;
import edu.cmu.cs.in.hadoop.HoopHadoopReporter;
import edu.cmu.cs.in.hadoop.HoopInvertedListMapper;
import edu.cmu.cs.in.hadoop.HoopInvertedListReducer;
import edu.cmu.cs.in.hadoop.HoopShardedOutputFormat;
import edu.cmu.cs.in.hadoop.HoopWordCountMapper;
import edu.cmu.cs.in.hadoop.HoopWordCountReducer;
import edu.cmu.cs.in.hadoop.HoopWholeFileInputFormat;

public class HoopRemoteTask extends HoopHadoopReporter {
    public static final String DATE_FORMAT_NOW = "yyyy-MM-dd HH:mm:ss";
    private HoopDataSet data = null;
    public static FileSystem hdfs = null;

    /**
     *
     */
    public HoopRemoteTask() {
        setClassName("HoopRemoteTask");
        debug("HoopRemoteTask ()");
        HoopRoot.debug("Hoop", "Running on: " + HoopHadoopReporter.getMachineName());
    }

    /**
    *
    */
    private Boolean indexDocuments() {
        debug("indexDocuments ()");

        /*
        data=new HoopDataSet ();
        data.loadDocuments(HoopLink.datapath);       
         data.printStats ();
         */

        return (true);
    }

    /**
    *
    */
    private static void usage() {
        System.out.println("Usage: choose either one of:");
        System.out.println(" 1) Locally analyzing wiki files");
        System.out.println(" 2) Running the MapReduce task on hadoop");
        System.out.println(" 3) Post processing/analyzing hadoop output");
        System.out.println(" ");
        System.out.println("Usage 1): -classpath HoopRemoteTask.jar HoopRemoteTask <options>");
        System.out.println(" <options>:");
        System.out.println("   -task wordcount|invert");
        System.out.println("   -splitsize <nr> (Default: 10)"); // 10, 10K or 60K       
        System.out.println("   -shards <nr> (Default: 1)");
        System.out.println("   -shardcreate hdfs|partitions|mos|none (Default: hdfs)");
        System.out.println("   -stopwords yes/no (Default: yes)");
        System.out.println("   -stemming yes/no (Default: yes)");
        System.out.println("   -minstemsize <Number> (Default: 4)");
        System.out.println("   -cleanoutput yes/no (Default: no");
        System.out.println("   -datapath <path>");
        System.out.println("   -outputpath <path>");
        System.out.println("   -monitorhost <ip|hostname>");
        System.out.println("Example:");
        System.out.println(
                " java -classpath HoopRemoteTask.jar HoopRemoteTask -cleanoutput yes -datapath /Root/DataSet/Wiki-10 -outputpath Root/Results/");
        System.out.println(" ");
        System.out.println("Usage 2): -classpath HoopRemoteTask.jar HoopRemoteTask -hadoop <options>");
        System.out.println(" <options>:");
        System.out.println("   -splitsize <nr> (Default: 10)"); // 10, 10K or 60K       
        System.out.println("   -shards <nr> (Default: 1)");
        System.out.println("   -shardcreate hdfs|partitions|mos|none (Default: hdfs)");
        System.out.println("   -dbglocal yes/no (Default: no)");
        System.out.println("   -stopwords yes/no (Default: yes)");
        System.out.println("   -stemming yes/no (Default: yes)");
        System.out.println("   -minstemsize <Number> (Default: 4)");
        System.out.println("   -datapath <path>");
        System.out.println("   -outputpath <path>");
        System.out.println("   -monitorhost <ip|hostname>");
        System.out.println("Example:");
        System.out.println(
                " java -classpath HoopRemoteTask.jar HoopRemoteTask -datapath /user/hduser/Wiki-10 -outputpath /user/hduser/output");
        System.out.println(" ");
        System.out.println("Usage 3): -classpath HoopRemoteTask.jar HoopSorterStats <options>");
        System.out.println(" <options>:");
        System.out.println("   -inputfile <path-to-file");
        System.out.println("   -gettop yes/no (Default: no) Shows the top 25 frequencies for the selected input");
        System.out.println(
                "   -getselected yes/no (Default: no) Uses an internal table of selected words to show frequences for (see homework)");
        System.out.println(
                "   -getshowrare yes/no (Default: no) For the frequencies 1,2,3,4,5 create output files using the output path and list the terms for each");
        System.out.println(" ");
        System.out.println("Example:");
        System.out.println(
                " java -classpath HoopRemoteTask.jar HoopSorterStats -getshowrare yes -inputfile Root/Results/MapReduced-60k.txt -outputpath Root/Results/");
    }

    /**
    *
    */
    private static void dbg(String aMessage) {
        HoopRoot.debug("HoopRemoteTask", aMessage);
    }

    /**
    *
    */
    private static Boolean parseArgs(String args[]) {
        dbg("parseArgs ()");

        if (args.length < 3) {
            return (false);
        }

        for (int i = 0; i < args.length; i++) {
            if (args[i].compareTo("-casefold") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.casefold = true;
                else
                    HoopLink.casefold = false;
            }

            if (args[i].compareTo("-dbglocal") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.dbglocal = true;
                else
                    HoopLink.dbglocal = false;
            }

            if (args[i].compareTo("-stopwords") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.stopwords = true;
                else
                    HoopLink.stopwords = false;
            }

            if (args[i].compareTo("-stemming") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.stemming = true;
                else
                    HoopLink.stemming = false;
            }

            if (args[i].compareTo("-minstemsize") == 0) {
                String stemsizepre = args[i + 1];
                HoopLink.minstemsize = Integer.parseInt(stemsizepre);
            }

            if (args[i].compareTo("-cleanoutput") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.cleanoutput = true;
                else
                    HoopLink.stemming = false;
            }

            if (args[i].compareTo("-datapath") == 0) {
                HoopLink.datapath = args[i + 1];
            }

            if (args[i].compareTo("-outputpath") == 0) {
                HoopLink.outputpath = args[i + 1];
            }

            if (args[i].compareTo("-hadoop") == 0) {
                HoopLink.useHadoop = true;
            }

            if (args[i].compareTo("-task") == 0) {
                String taskpre = args[i + 1];
                HoopLink.task = taskpre.toLowerCase();
            }

            if (args[i].compareTo("-splitsize") == 0) {
                String sizepre = args[i + 1];
                HoopLink.splitsize = Long.parseLong(sizepre);
            }

            if (args[i].compareTo("-shards") == 0) {
                String shardpre = args[i + 1];
                HoopLink.nrshards = Integer.parseInt(shardpre);
            }

            if (args[i].compareTo("-shardtype") == 0) {
                HoopLink.shardtype = args[i + 1];
            }

            if (args[i].compareTo("-shardcreate") == 0) {
                String createpre = args[i + 1];
                HoopLink.shardcreate = createpre.toLowerCase();
            }

            if (args[i].compareTo("-postonly") == 0) {
                if (args[i + 1].equals("yes") == true)
                    HoopLink.postonly = true;
                else
                    HoopLink.postonly = false;
            }

            if (args[i].compareTo("-monitorhost") == 0) {
                String createhost = args[i + 1];
                HoopLink.monitorHost = createhost.toLowerCase();
            }
        }

        return (true);
    }

    /**
    *
    */
    public static void postOnly() {
        dbg("postOnly ()");

        // First get the amount of items in the file ...

        BufferedReader reader = null;

        //HoopFileManager fManager=new HoopFileManager ();

        String content = HoopLink.fManager.loadContents(HoopLink.datapath);

        long termCount = HoopStringTools.dataToLines(content).size();

        dbg("Sharding " + termCount + " terms ...");

        // Then shard as usual

        try {
            reader = new BufferedReader(new FileReader(HoopLink.datapath));
        } catch (FileNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        String thisLine = "";
        int count = 0;
        int split = Math.round(termCount / HoopLink.nrshards);
        int partition = 0;
        Writer out = null;

        try {
            out = new BufferedWriter(
                    new FileWriter(HoopLink.outputpath + "/partition-" + partition + "-00000.txt"));
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        dbg("Sharding to: " + HoopLink.outputpath + "/partition-" + partition + "-00000.txt");

        if (out != null) {
            try {
                while ((thisLine = reader.readLine()) != null) {
                    count++;

                    if (count > split) {
                        partition++;

                        out = new BufferedWriter(
                                new FileWriter(HoopLink.outputpath + "/partition-" + partition + "-00000.txt"));

                        dbg("Sharding to: " + HoopLink.outputpath + "/partition-" + partition + "-00000.txt");

                        split++;
                        count = 0;
                    }

                    out.write(thisLine);
                    out.write("\n");
                }
            } catch (UnsupportedEncodingException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        } else
            dbg("Error: unable to open output file");

        dbg("All done!");
    }

    /**
    *
    */
    public static int countTerms(Configuration conf) {
        dbg("postProcess ()");

        int count = 0;

        String output = conf.get("mapred.output.dir");

        if (output != null) {
            if (output.isEmpty() == true)
                output = HoopLink.outputpath;
        } else
            output = HoopLink.outputpath;

        Path inFile = new Path(output + "/part-r-00000");
        FSDataInputStream in = null;

        @SuppressWarnings("unused")
        String thisLine = null;

        try {
            in = HoopRemoteTask.hdfs.open(inFile);

            BufferedReader reader = new BufferedReader(new InputStreamReader(in));

            while ((thisLine = reader.readLine()) != null) {
                count++;
            }

            in.close();
        } catch (IOException e) {
            e.printStackTrace();
            dbg("Error opening file in HDFS");
        }

        return (count);
    }

    /**
    *
    */
    public static void postProcess(Configuration conf) {
        dbg("postProcess ()");

        if (HoopLink.nrshards == 1) {
            dbg("Only 1 shard needed, skipping post processing");
            return;
        }

        if (HoopLink.shardcreate.equals("mos") == true) {
            dbg("We shouldn't be pos-processing since the HoopShardedOutputFormat class already did this");
            return;
        }

        if (HoopLink.shardcreate.equals("hdfs") == true) {
            dbg("Starting shard post-process task ...");

            int termCount = countTerms(conf);

            String output = conf.get("mapred.output.dir");

            if (output != null) {
                if (output.isEmpty() == true)
                    output = HoopLink.outputpath;
            } else
                output = HoopLink.outputpath;

            dbg("Post processing " + termCount + " items in: " + output);

            Path inFile = new Path(output + "/part-r-00000");
            Path outFile = null;
            FSDataInputStream in = null;
            FSDataOutputStream out = null;

            try {
                in = HoopRemoteTask.hdfs.open(inFile);

                BufferedReader reader = new BufferedReader(new InputStreamReader(in));

                String thisLine;

                int count = 0;
                int split = Math.round(termCount / HoopLink.nrshards);
                int partition = 0;

                outFile = new Path(output + "/partition-" + partition + "-00000.txt");
                out = HoopRemoteTask.hdfs.create(outFile);

                if (out != null) {
                    while ((thisLine = reader.readLine()) != null) {
                        StringBuffer formatted = new StringBuffer();
                        formatted.append(thisLine);
                        formatted.append("\n");

                        count++;

                        if (count > split) {
                            out.close();

                            partition++;

                            outFile = new Path(output + "/partition-" + partition + "-00000.txt");
                            out = HoopRemoteTask.hdfs.create(outFile);

                            split++;
                            count = 0;
                        }

                        byte[] utf8Bytes = formatted.toString().getBytes("UTF8");
                        // We get an additional 0 because of Java string encoding. leave it out!
                        out.write(utf8Bytes);
                    }

                    if (in != null)
                        in.close();

                    if (out != null)
                        out.close();
                } else
                    dbg("Error: unable to open output file");

            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }

            dbg("Starting rudimentary sharding into " + HoopLink.nrshards);

            if (in != null) {

                try {
                    in.close();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }

        }

        HoopStatistics stats = new HoopStatistics();
        String results = stats.printStatistics(null);
        dbg(results);
    }

    /**
    *
    */
    public static void showTimeStamp() {
        Calendar cal = Calendar.getInstance();
        SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT_NOW);
        dbg("Time: " + sdf.format(cal.getTime()));
    }

    /**
     * 
     */
    public static void transferConf(Configuration conf) {
        conf.set("useHadoop", new Boolean(HoopLink.useHadoop).toString());
        conf.set("casefold", new Boolean(HoopLink.casefold).toString());
        conf.set("stopwords", new Boolean(HoopLink.stopwords).toString());
        conf.set("stemming", new Boolean(HoopLink.stemming).toString());
        conf.set("cleanoutput", new Boolean(HoopLink.cleanoutput).toString());
        conf.set("dbglocal", new Boolean(HoopLink.dbglocal).toString());
        conf.set("minstemsize", new Integer(HoopLink.minstemsize).toString());
        conf.set("splitsize", new Long(HoopLink.splitsize).toString());
        conf.set("nrshards", new Integer(HoopLink.nrshards).toString());
        conf.set("shardtype", HoopLink.shardtype);
        conf.set("shardcount", new Integer(HoopLink.shardcount).toString());
        conf.set("shardcreate", new Boolean(HoopLink.shardcreate).toString());
        conf.set("task", HoopLink.task);
        conf.set("monitorHost", HoopLink.monitorHost);
    }

    /**
    *
    */
    public static void main(String args[]) throws Exception {
        // run the HoopLink constructor; We need this to have a global settings registry       
        @SuppressWarnings("unused")
        HoopLink link = new HoopLink();

        dbg("main ()");

        showTimeStamp();

        /**
         * I've taken out the statistics portion since it relies on code that isn't distributed
         * The next version will have this solved. I might try the solution in:
         * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
         * Although chances are I will switch to using Hoop to collect much better performance and distribution 
         * statistics. See Hoop.java for more information
         */

        HoopPerformanceMeasure metrics = new HoopPerformanceMeasure();
        metrics.setMarker("main");
        HoopLink.metrics.getDataSet().add(metrics);

        if (parseArgs(args) == false) {
            usage();
            return;
        }

        if (HoopLink.postonly == true) {
            postOnly();
            return;
        }

        if (HoopLink.task.equals("none") == true) {
            dbg("No task defined, please use the commandline option -task <task>");
            return;
        }

        dbg("Starting system ...");

        HoopRemoteTask driver = new HoopRemoteTask();

        if (HoopLink.useHadoop == false) {
            dbg("Starting built-in mapper ...");

            driver.indexDocuments();
        } else {
            dbg("Starting hadoop job ...");

            Configuration conf = new Configuration();

            // TRANSFER SETTHoopGS FROM HoopLink to Configuration!!!

            transferConf(conf);

            // Now we're feeling much better

            HoopRemoteTask.hdfs = FileSystem.get(conf);

            if (HoopLink.dbglocal == true) {
                dbg("Enabling local debugging ...");
                conf.set("mapred.job.tracker", "local");
            } else
                dbg("Disabling local debugging");

            JobConf job = new JobConf(conf, HoopRemoteTask.class);

            job.setJobName(driver.getClassName());

            driver.setJob(job);

            @SuppressWarnings("unused")
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

            job.setJarByClass(HoopRemoteTask.class);

            if (HoopLink.task.equals("invert") == true) {
                dbg("Configuring job for invert task ...");

                job.setReducerClass(HoopInvertedListReducer.class);
                job.setMapperClass(HoopInvertedListMapper.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
            }

            if (HoopLink.task.equals("wordcount") == true) {
                dbg("Configuring job for wordcount task ...");

                job.setReducerClass(HoopWordCountReducer.class);
                job.setMapperClass(HoopWordCountMapper.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(IntWritable.class);
            }

            dbg("Using input path: " + HoopLink.datapath);
            dbg("Using output path: " + HoopLink.outputpath);

            FileInputFormat.addInputPath(job, new Path(HoopLink.datapath));
            FileOutputFormat.setOutputPath(job, new Path(HoopLink.outputpath));

            job.setInputFormat(HoopWholeFileInputFormat.class);

            if ((HoopLink.shardcreate.equals("mos") == true) && (HoopLink.nrshards > 1)) {
                dbg("Setting output to sharded output streams class ...");

                job.setOutputFormat(HoopShardedOutputFormat.class);
            } else
                job.setOutputFormat(TextOutputFormat.class);

            /**
             * Temporarily commented out for testing purposes
             */

            //job.setPartitionerClass (HoopPartitioner.class);                      

            driver.register("Main");

            JobClient.runJob(job);

            postProcess(conf);
        }

        showTimeStamp();

        metrics.closeMarker();
        long timeTaken = metrics.getYValue();
        //long timeTaken=metrics.getMarkerRaw ();
        metrics.printMetrics(timeTaken);

        driver.unregister();

        /**
         * I've taken out the statistics portion since it relies on code that isn't distributed
         * The next version will have this solved. I might try the solution in:
         * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
         * Although chances are I will switch to using Hoop to collect much better performance and distribution 
         * statistics. See Hoop.java for more information
         */
        //stats.calcStatistics();
        //dbg (stats.printStatistics());
    }
}