edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java Source code

Introduction

Here is the source code for edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java, a Hadoop MapReduce driver that computes a pairwise distance matrix over input vector files, splitting the work into blocks.

Source

/**
 * Software License, Version 1.0
 * 
 * Copyright 2003 The Trustees of Indiana University.  All rights reserved.
 * 
 *
 *Redistribution and use in source and binary forms, with or without 
 *modification, are permitted provided that the following conditions are met:
 *
 *1) All redistributions of source code must retain the above copyright notice,
 * the list of authors in the original source code, this list of conditions and
 * the disclaimer listed in this license;
 *2) All redistributions in binary form must reproduce the above copyright 
 * notice, this list of conditions and the disclaimer listed in this license in
 * the documentation and/or other materials provided with the distribution;
 *3) Any documentation included with all redistributions must include the 
 * following acknowledgement:
 *
 *"This product includes software developed by the Community Grids Lab. For 
 * further information contact the Community Grids Lab at 
 * http://communitygrids.iu.edu/."
 *
 * Alternatively, this acknowledgement may appear in the software itself, and 
 * wherever such third-party acknowledgments normally appear.
 * 
 *4) The name Indiana University or Community Grids Lab or NaradaBrokering, 
 * shall not be used to endorse or promote products derived from this software 
 * without prior written permission from Indiana University.  For written 
 * permission, please contact the Advanced Research and Technology Institute 
 * ("ARTI") at 351 West 10th Street, Indianapolis, Indiana 46202.
 *5) Products derived from this software may not be called NaradaBrokering, 
 * nor may Indiana University or Community Grids Lab or NaradaBrokering appear
 * in their name, without prior written permission of ARTI.
 * 
 *
 * Indiana University provides no reassurances that the source code provided 
 * does not infringe the patent or any other intellectual property rights of 
 * any other entity.  Indiana University disclaims any liability to any 
 * recipient for claims brought by any other entity based on infringement of 
 * intellectual property rights or otherwise.  
 *
 *LICENSEE UNDERSTANDS THAT SOFTWARE IS PROVIDED "AS IS" FOR WHICH NO 
 *WARRANTIES AS TO CAPABILITIES OR ACCURACY ARE MADE. INDIANA UNIVERSITY GIVES
 *NO WARRANTIES AND MAKES NO REPRESENTATION THAT SOFTWARE IS FREE OF 
 *INFRINGEMENT OF THIRD PARTY PATENT, COPYRIGHT, OR OTHER PROPRIETARY RIGHTS. 
 *INDIANA UNIVERSITY MAKES NO WARRANTIES THAT SOFTWARE IS FREE FROM "BUGS", 
 *"VIRUSES", "TROJAN HORSES", "TRAP DOORS", "WORMS", OR OTHER HARMFUL CODE.  
 *LICENSEE ASSUMES THE ENTIRE RISK AS TO THE PERFORMANCE OF SOFTWARE AND/OR 
 *ASSOCIATED MATERIALS, AND TO THE PERFORMANCE AND VALIDITY OF INFORMATION 
 *GENERATED USING SOFTWARE.
 */

package edu.indiana.soic.ts.mapreduce.pwd;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import edu.indiana.soic.ts.utils.TSConfiguration;
import edu.indiana.soic.ts.utils.Utils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PairWiseDistance {
    private static final Logger LOG = LoggerFactory.getLogger(PairWiseDistance.class);
    private int blockSize;
    private String distFunc;
    private String interDistDir;
    private String distDir;
    private String vectDir;

    public static void main(String[] args) throws Exception {
        PairWiseDistance pwd = new PairWiseDistance();
        pwd.configure(args);
        pwd.submitJob();
    }

    /**
     * Sets up and submits the pairwise distance job for a single input file:
     * partitions the input into blocks, writes the block metadata, and runs
     * the MapReduce computation.
     */
    public int execJob(Configuration conf, String sequenceFileFullPath, String sequenceFile, String distDir)
            throws Exception {
        LOG.info("Input file : " + sequenceFileFullPath);
        Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

        /* Create the base directory for this job; delete and recreate it if it already exists */
        Path hdMainDir = new Path(distDir + "/" + sequenceFile);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(hdMainDir, true);
        Path hdInputDir = new Path(hdMainDir, "data");
        if (!fs.mkdirs(hdInputDir)) {
            throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
        }

        int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
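        // The distance matrix is split into noOfDivisions x noOfDivisions blocks;
        // only one triangle of the matrix (including the diagonal) has to be
        // computed, giving noOfDivisions * (noOfDivisions + 1) / 2 blocks.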
        int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
        int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
        LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks + "\nBlock size :"
                + blockSize);

        // Retrieving the configuration from the job to set the properties.
        // Setting properties on the original conf does not work (possible
        // Hadoop bug).
        Configuration jobConf = job.getConfiguration();

        // Input dir in HDFS. Create this in newly created job base dir
        Path inputDir = new Path(hdMainDir, "input");
        if (!fs.mkdirs(inputDir)) {
            throw new IOException("Mkdirs failed to create " + inputDir.toString());
        }

        long dataPartitionStartTime = System.nanoTime();
        partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);

        distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);

        long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
        LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

        // Output dir in HDFS
        Path hdOutDir = new Path(hdMainDir, "out");

        jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
        jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
        jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
        jobConf.set(Constants.DIST_FUNC, distFunc);

        job.setJarByClass(PairWiseDistance.class);
        job.setMapperClass(SWGMap.class);
        job.setReducerClass(SWGReduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(SWGWritable.class);
        FileInputFormat.setInputPaths(job, hdInputDir);
        FileOutputFormat.setOutputPath(job, hdOutDir);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
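        // One reduce task per row of blocks; concatOutput() later collects the
        // reducers' row_<n>_ part files in row order.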
        job.setNumReduceTasks(noOfDivisions);

        long startTime = System.currentTimeMillis();
        int exitStatus = job.waitForCompletion(true) ? 0 : 1;
        double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
        LOG.info("Job Finished in " + executionTime + " seconds");
        LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput" + noOfSequences + "\t" + noOfBlocks + "\t"
                + executionTime + "\t" + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);

        return exitStatus;
    }

    /**
     * Reads the block size, distance function and working directories from
     * the configuration file passed on the command line.
     */
    public void configure(String[] args) {
        String configFile = Utils.getConfigurationFile(args);
        TSConfiguration tsConfiguration = new TSConfiguration(configFile);
        Map tsConf = tsConfiguration.getConf();

        this.blockSize = (int) tsConf.get(TSConfiguration.MATRIX_BLOCK_SIZE);
        this.distFunc = (String) tsConf.get(TSConfiguration.DISTANCE_FUNCTION);
        this.interDistDir = tsConfiguration.getInterMediateDistanceDir();
        this.distDir = tsConfiguration.getDistDir();
        this.vectDir = tsConfiguration.getVectorDir();
    }

    /**
     * Runs a pairwise distance job for every vector file found in the vector
     * directory, then concatenates each job's output into a single distance file.
     */
    public void submitJob() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.listStatus(new Path(vectDir));
        for (FileStatus fileStatus : status) {
            String sequenceFile = fileStatus.getPath().getName();
            String sequenceFileFullPath = vectDir + "/" + sequenceFile;
            try {
                execJob(conf, sequenceFileFullPath, sequenceFile, interDistDir);
                concatOutput(conf, sequenceFile, interDistDir, distDir);
            } catch (Exception e) {
                String message = "Failed to execute PWD calculation: " + sequenceFileFullPath + " " + interDistDir;
                LOG.error(message, e);
                throw new RuntimeException(message, e);
            }
        }
    }

    /**
     * Helper that pairs a row number with its output file name so that the
     * part files can be sorted into row order before concatenation.
     */
    private static class OutFile implements Comparable<OutFile> {
        int no;
        String file;

        public OutFile(int no, String file) {
            this.no = no;
            this.file = file;
        }

        @Override
        public int compareTo(OutFile o) {
            // sort ascending by row number so rows are concatenated in order
            return Integer.compare(this.no, o.no);
        }
    }

    /**
     * Concatenates the per-row output files of a completed job into a single
     * distance file, ordered by row number.
     */
    public void concatOutput(Configuration conf, String sequenceFile, String distDirIntermediate, String distDir)
            throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path outDir = new Path(distDirIntermediate + "/" + sequenceFile + "/out");
        FileStatus[] status = fs.listStatus(outDir);
        List<OutFile> outFiles = new ArrayList<OutFile>();
        for (FileStatus fileStatus : status) {
            String name = fileStatus.getPath().getName();
            String[] split = name.split("_");
            if (split.length > 2 && split[0].equals("row")) {
                outFiles.add(new OutFile(Integer.parseInt(split[1]), name));
            }
        }

        Collections.sort(outFiles);
        String destFile = distDir + "/" + sequenceFile;
        Path outFile = new Path(destFile);
        FSDataOutputStream outputStream = fs.create(outFile);
        try {
            for (OutFile o : outFiles) {
                FSDataInputStream inputStream = fs.open(new Path(outDir, o.file));
                try {
                    IOUtils.copy(inputStream, outputStream);
                } finally {
                    inputStream.close();
                }
            }
            outputStream.flush();
        } finally {
            outputStream.close();
        }
    }

    private void distributeData(int blockSize, Configuration conf, FileSystem fs, Path hdInputDir,
            int noOfDivisions) throws IOException {
        // Write the metadata for each block to a separate file so that Hadoop
        // will create a separate map task per block.
        // Key : block number
        // Value: row#column#isDiagonal#base_file_name
        // TODO : find a better way to do this.
        for (int row = 0; row < noOfDivisions; row++) {
            for (int column = 0; column < noOfDivisions; column++) {
                // The load-balancing pattern selects exactly one of the two
                // mirror blocks (row, column) and (column, row) for each pair.
                // Diagonal blocks are included since they are whole blocks,
                // not individual pairs.
                if (((row >= column) && ((row + column) % 2 == 0)) || ((row <= column) && ((row + column) % 2 == 1))) {
                    Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                    SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile, LongWritable.class,
                            Text.class, CompressionType.NONE);

                    boolean isDiagonal = (row == column);
                    String value = row + Constants.BREAK + column + Constants.BREAK + isDiagonal + Constants.BREAK
                            + Constants.HDFS_SEQ_FILENAME;
                    vWriter.append(new LongWritable(row * blockSize + column), new Text(value));
                    vWriter.close();
                }
            }
        }
    }

    /**
     * Counts the number of sequences (lines) in the given input file.
     */
    private int getNoOfSequences(String sequenceFile, FileSystem fs) throws IOException {
        Path path = new Path(sequenceFile);
        int count = 0;
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        while ((bufferedReader.readLine()) != null) {
            count++;
        }
        bufferedReader.close();
        return count;
    }

    /**
     * Breaks the sequence file into parts based on the block size, stores the
     * parts in HDFS and adds each part to the Hadoop distributed cache.
     */
    private void partitionData(String sequenceFile, int noOfSequences, int blockSize, FileSystem fs,
            int noOfDivisions, Configuration jobConf, Path inputDir) throws IOException, URISyntaxException {
        Path path = new Path(sequenceFile);
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));

        LOG.info("noOfDivisions : " + noOfDivisions);
        LOG.info("blockSize : " + blockSize);
        for (int partNo = 0; partNo < noOfDivisions; partNo++) {
            String filePartName = Constants.HDFS_SEQ_FILENAME + "_" + partNo;
            Path inputFilePart = new Path(inputDir, filePartName);
            OutputStream partOutStream = fs.create(inputFilePart);
            BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(partOutStream));

            for (int sequenceIndex = 0; sequenceIndex < blockSize
                    && sequenceIndex + (partNo * blockSize) < noOfSequences; sequenceIndex++) {
                String line = bufferedReader.readLine();
                if (line == null) {
                    throw new IOException("Cannot read the sequence from input file.");
                }
                // write the sequence name
                bufferedWriter.write(line);
                bufferedWriter.newLine();
            }
            bufferedWriter.flush();
            bufferedWriter.close();
            // Adding the sequence file part to the Hadoop distributed cache
            URI cFileURI = new URI(inputFilePart.toUri() + "#" + filePartName);
            DistributedCache.addCacheFile(cFileURI, jobConf);
            DistributedCache.createSymlink(jobConf);
        }
        bufferedReader.close();
    }
}
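
Example

The block selection in distributeData materializes exactly one of the two mirror blocks (row, column) and (column, row) for every pair of divisions, alternating sides so the work is spread evenly across rows. The standalone sketch below (not part of the original source; the division count of 4 is an arbitrary example) prints which blocks would get map tasks:

public class BlockSelectionDemo {
    public static void main(String[] args) {
        int noOfDivisions = 4; // arbitrary example size
        for (int row = 0; row < noOfDivisions; row++) {
            StringBuilder line = new StringBuilder();
            for (int column = 0; column < noOfDivisions; column++) {
                // Same condition as PairWiseDistance.distributeData
                boolean selected = ((row >= column) && ((row + column) % 2 == 0))
                        || ((row <= column) && ((row + column) % 2 == 1));
                line.append(selected ? " X" : " .");
            }
            System.out.println(line);
        }
    }
}

For four divisions this selects ten blocks, matching the noOfBlocks = noOfDivisions * (noOfDivisions + 1) / 2 count computed in execJob, with each pair of divisions covered exactly once.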