INDIANA UNIVERSITY GIVES
 *NO WARRANTIES AND MAKES NO REPRESENTATION THAT SOFTWARE IS FREE OF
 *INFRINGEMENT OF THIRD PARTY PATENT, COPYRIGHT, OR OTHER PROPRIETARY RIGHTS.
 *INDIANA UNIVERSITY MAKES NO WARRANTIES THAT SOFTWARE IS FREE FROM "BUGS",
 *"VIRUSES", "TROJAN HORSES", "TRAP DOORS", "WORMS", OR OTHER HARMFUL CODE.
 *LICENSEE ASSUMES THE ENTIRE RISK AS TO THE PERFORMANCE OF SOFTWARE AND/OR
 *ASSOCIATED MATERIALS, AND TO THE PERFORMANCE AND VALIDITY OF INFORMATION
 *GENERATED USING SOFTWARE.
 */

package msc.fall2015.stock.kmeans.hbase.mapreduce.pwd;

// NOTE(review): the java.io, java.net and org.apache.hadoop.io imports below
// were restored from the identifiers this file uses; the original import
// lines were unreadable. Confirm against version control (two further,
// apparently unused, imports could not be recovered).
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the "Pairwise-analysis" MapReduce job.
 *
 * <p>The driver splits the input sequence file into parts of {@code blockSize}
 * lines, stores the parts in HDFS and registers them in the distributed cache,
 * writes one small sequence file per selected (row, column) block so that each
 * block becomes its own map input, and then runs the job with {@link SWGMap}
 * and {@link SWGReduce}.
 *
 * @author Thilina Gunarathne
 */
public class PairWiseAlignment extends Configured implements Tool {

    /**
     * Command-line entry point; delegates to {@link ToolRunner} and exits with
     * the status returned by {@link #run(String[])}.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception if the job set-up or execution fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new PairWiseAlignment(), args);
        System.exit(res);
    }

    /**
     * Prepares the HDFS working directory, partitions and scatters the input,
     * and runs the job.
     *
     * <p>Arguments (index 0 is not read by this method):
     * <ol>
     *   <li>{@code args[1]} path of the sequence file</li>
     *   <li>{@code args[2]} number of sequences</li>
     *   <li>{@code args[3]} block size (must be at least 1)</li>
     *   <li>{@code args[4]} whether weights are calculated (boolean)</li>
     *   <li>{@code args[5]} optional local file that receives timing statistics</li>
     * </ol>
     *
     * @param args command-line arguments as described above
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws IllegalArgumentException if the block size is smaller than 1
     * @throws Exception if HDFS access or job execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // args[1]..args[4] are all mandatory, so fewer than five entries
        // cannot be processed.
        if (args.length < 5) {
            System.err.println(
                    "Usage: <sequence_file> <sequence_count> <block_size> <weight> [timing_file]");
            System.exit(2);
        }

        /* input parameters */
        String sequenceFile = args[1];
        System.out.println(sequenceFile);

        // we are limited to int's as java loops supports only them
        int noOfSequences = Integer.parseInt(args[2]);
        int blockSize = Integer.parseInt(args[3]);
        boolean weightCalculate = Boolean.parseBoolean(args[4]);
        if (blockSize < 1) {
            // A zero block size would turn the division below into infinity.
            throw new IllegalArgumentException("block_size must be at least 1 but was " + blockSize);
        }

        Configuration conf = new Configuration();
        Job job = new Job(conf, "Pairwise-analysis");

        /* create the base dir for this job. Delete and recreates if it exists */
        Path hdMainDir = new Path(
                msc.fall2015.stock.kmeans.utils.Constants.HDFS_HOME_PATH + "swg-hadoop");
        FileSystem fs = FileSystem.get(conf);
        fs.delete(hdMainDir, true);
        Path hdInputDir = new Path(hdMainDir, "data");
        if (!fs.mkdirs(hdInputDir)) {
            throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
        }

        int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
        int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
        System.out.println("No of divisions :" + noOfDivisions
                + "\nNo of blocks :" + noOfBlocks + "\nBlock size :" + blockSize);

        // Retrieving the configuration from the job to set the properties.
        // Setting properties to the original conf does not work (possible
        // Hadoop bug)
        Configuration jobConf = job.getConfiguration();

        // Input dir in HDFS. Create this in newly created job base dir
        Path inputDir = new Path(hdMainDir, "input");
        if (!fs.mkdirs(inputDir)) {
            throw new IOException("Mkdirs failed to create " + inputDir.toString());
        }

        long dataPartitionStartTime = System.nanoTime();
        partitionData(sequenceFile, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);
        distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
        long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
        System.out.println("Data Partition & Scatter Completed in (ms):" + dataPartTime);

        // Output dir in HDFS
        Path hdOutDir = new Path(hdMainDir, "out");

        jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
        jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
        jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
        jobConf.setBoolean(Constants.WEIGHT_ENABLED, weightCalculate);

        job.setJarByClass(PairWiseAlignment.class);
        job.setMapperClass(SWGMap.class);
        job.setReducerClass(SWGReduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(SWGWritable.class);
        FileInputFormat.setInputPaths(job, hdInputDir);
        FileOutputFormat.setOutputPath(job, hdOutDir);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(noOfDivisions);

        long startTime = System.currentTimeMillis();
        int exitStatus = job.waitForCompletion(true) ? 0 : 1;
        double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Job Finished in " + executionTime + " seconds");

        // The statistics file is its own optional argument; args[4] is the
        // weight flag and must not double as a file name.
        if (args.length > 5) {
            FileWriter writer = new FileWriter(args[5]);
            try {
                writer.write("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput");
                writer.write("\n");
                writer.write(noOfSequences + "\t" + noOfBlocks + "\t" + executionTime
                        + "\t" + sequenceFile + "\t" + dataPartTime + "\t" + hdMainDir);
                writer.write("\n");
                writer.flush();
            } finally {
                writer.close();
            }
        }
        return exitStatus;
    }

    /**
     * Writes one single-record sequence file per selected block into
     * {@code hdInputDir}, so that Hadoop creates a separate map task for each
     * block.
     *
     * <p>Record layout: key = {@code row * blockSize + column}; value =
     * {@code row#column#isDiagonal#base_file_name}, where the separator is
     * {@code Constants.BREAK} and the base file name is
     * {@code Constants.HDFS_SEQ_FILENAME}.
     *
     * @param blockSize number of sequences per block, used to derive the key
     * @param conf configuration handed to the sequence-file writer
     * @param fs file system the block files are written to
     * @param hdInputDir directory that receives the {@code data_file_<row>_<column>} files
     * @param noOfDivisions number of rows and of columns in the block grid
     * @throws IOException if a block file cannot be written
     */
    private void distributeData(int blockSize, Configuration conf, FileSystem fs,
            Path hdInputDir, int noOfDivisions) throws IOException {
        // TODO : find a better way to do this.
        for (int row = 0; row < noOfDivisions; row++) {
            for (int column = 0; column < noOfDivisions; column++) {
                if (!isSelectedBlock(row, column)) {
                    continue;
                }
                Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile,
                        LongWritable.class, Text.class, CompressionType.NONE);
                try {
                    boolean isDiagonal = (row == column);
                    String value = row + Constants.BREAK + column + Constants.BREAK
                            + isDiagonal + Constants.BREAK + Constants.HDFS_SEQ_FILENAME;
                    // Compute the key in long arithmetic so large grids do not
                    // overflow int before the value is widened.
                    long key = (long) row * blockSize + column;
                    vWriter.append(new LongWritable(key), new Text(value));
                } finally {
                    vWriter.close();
                }
            }
        }
    }

    /**
     * Load-balancing selection of blocks. Of the two mirrored blocks
     * (row, column) and (column, row) exactly one is selected: the one at or
     * below the diagonal when {@code row + column} is even, the one at or above
     * it when the sum is odd. Diagonal blocks are always selected, because they
     * are blocks and not individual pairs.
     */
    private static boolean isSelectedBlock(int row, int column) {
        boolean evenSum = (row + column) % 2 == 0;
        boolean oddSum = (row + column) % 2 == 1;
        return (row >= column && evenSum) || (row <= column && oddSum);
    }

    /**
     * Breaks the sequence file into parts of {@code blockSize} lines, stores
     * each part in {@code inputDir} as {@code Constants.HDFS_SEQ_FILENAME + "_" + partNo}
     * and adds it to the Hadoop distributed cache under that name.
     *
     * @param sequenceFile path of the sequence file, opened through {@code fs}
     * @param noOfSequences total number of lines to copy
     * @param blockSize maximum number of lines per part
     * @param fs file system used for reading the input and writing the parts
     * @param noOfDivisions number of parts to write
     * @param jobConf job configuration that receives the cache entries
     * @param inputDir directory that receives the parts
     * @throws FileNotFoundException declared by the original signature
     * @throws IOException if the input ends before {@code noOfSequences} lines
     *         were read, or if reading or writing fails
     * @throws URISyntaxException if a cache URI cannot be built
     */
    private void partitionData(String sequenceFile, int noOfSequences, int blockSize,
            FileSystem fs, int noOfDivisions, Configuration jobConf, Path inputDir)
            throws FileNotFoundException, IOException, URISyntaxException {
        Path path = new Path(sequenceFile);
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        try {
            System.out.println("noOfDivisions : " + noOfDivisions);
            System.out.println("blockSize : " + blockSize);
            for (int partNo = 0; partNo < noOfDivisions; partNo++) {
                String filePartName = Constants.HDFS_SEQ_FILENAME + "_" + partNo;
                Path inputFilePart = new Path(inputDir, filePartName);
                OutputStream partOutStream = fs.create(inputFilePart);
                BufferedWriter bufferedWriter =
                        new BufferedWriter(new OutputStreamWriter(partOutStream));
                try {
                    // The last part may be shorter than blockSize.
                    for (int sequenceIndex = 0;
                            sequenceIndex < blockSize
                                    && sequenceIndex + (partNo * blockSize) < noOfSequences;
                            sequenceIndex++) {
                        String line = bufferedReader.readLine();
                        if (line == null) {
                            throw new IOException("Cannot read the sequence from input file.");
                        }
                        // write the sequence name
                        bufferedWriter.write(line);
                        bufferedWriter.newLine();
                    }
                    bufferedWriter.flush();
                } finally {
                    bufferedWriter.close();
                }

                // Adding the sequences file to Hadoop cache
                URI cFileURI = new URI(inputFilePart.toUri() + "#" + filePartName);
                DistributedCache.addCacheFile(cFileURI, jobConf);
                DistributedCache.createSymlink(jobConf);
            }
        } finally {
            bufferedReader.close();
        }
    }
}