Java tutorial
/** * QCRI, sPCA LICENSE * sPCA is a scalable implementation of Principal Component Analysis (PCA) on of Spark and MapReduce * * Copyright (c) 2015, Qatar Foundation for Education, Science and Community Development (on * behalf of Qatar Computing Research Institute) having its principle place of business in Doha, * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI") * */ package org.qcri.pca; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.util.ToolRunner; public class FileFormat extends AbstractJob { private final static Logger log = LoggerFactory.getLogger(FileFormat.class); public enum OutputFormat { DENSE, //Dense matrix LIL, //List of lists COO, //Coordinate List } public enum InputFormat { DENSE, COO } @Override public int run(String[] arg0) throws Exception { final String inputPath; final int cardinality; final String outputPath; final InputFormat inputFormat; Configuration conf = getConf(); System.out.println(conf.get("Input")); try { inputPath = System.getProperty("Input"); if (inputPath == null) throw new IllegalArgumentException(); } catch (Exception e) { printLogMessage("Input"); return -1; } try { inputFormat = InputFormat.valueOf(System.getProperty("InputFmt")); } catch (IllegalArgumentException e) { log.warn("Invalid Format " + System.getProperty("InputFmt")); return -1; } catch (Exception e) { printLogMessage("InputFmt"); return -1; } try { outputPath = System.getProperty("Output"); if (outputPath == null) throw new IllegalArgumentException(); File outputFile = new File(outputPath); if (outputFile.isFile() || outputFile == null) { log.error("Output Path must be a directory, " + outputPath + " is either not a directory or not a valid path"); return -1; } } catch (Exception e) { printLogMessage("Output"); return -1; } try { cardinality = Integer.parseInt(System.getProperty("Cardinality")); } catch (Exception e) { printLogMessage("Cardinality"); return -1; } int base = -1; try { base = Integer.parseInt(System.getProperty("Base")); } catch (Exception e) { log.warn( "It is not specified whether the input is zero-based or one-based, this parameter is useful only if the input is in COO format"); } switch (inputFormat) { case COO: if (base == -1) { log.error( "You have to specify whether the rows and columns IDs start with 0 or 1 using the argument -DBase"); return -1; } convertFromCooToSeq(inputPath, cardinality, base, outputPath); break; case DENSE: convertFromDenseToSeq(inputPath, cardinality, outputPath); break; } return 0; } public static void main(String[] args) { try { ToolRunner.run(new Configuration(), new SPCADriver(), args); } catch (Exception e) { e.printStackTrace(); } } public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) { try { final Configuration conf = new Configuration(); final FileSystem fs = FileSystem.get(conf); SequenceFile.Writer writer; final IntWritable key = new IntWritable(); final VectorWritable value = new VectorWritable(); int lineNumber = 0; String thisLine; File[] filePathList = null; File inputFile = new File(inputPath); if (inputFile.isFile()) // if it is a file { filePathList = new File[1]; filePathList[0] = inputFile; } else { filePathList = inputFile.listFiles(); } if (filePathList == null) { log.error("The path " + inputPath + " does not exist"); return; } for (File file : filePathList) { BufferedReader br = new BufferedReader(new FileReader(file)); Vector vector = null; String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq"; writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class, VectorWritable.class, CompressionType.BLOCK); while ((thisLine = br.readLine()) != null) { // while loop begins here if (thisLine.isEmpty()) continue; String[] splitted = thisLine.split("\\s+"); vector = new SequentialAccessSparseVector(splitted.length); for (int i = 0; i < splitted.length; i++) { vector.set(i, Double.parseDouble(splitted[i])); } key.set(lineNumber); value.set(vector); //System.out.println(vector); writer.append(key, value);//write last row lineNumber++; } writer.close(); } } catch (Exception e) { e.printStackTrace(); } } public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) { try { final Configuration conf = new Configuration(); final FileSystem fs = FileSystem.get(conf); SequenceFile.Writer writer = null; final IntWritable key = new IntWritable(); final VectorWritable value = new VectorWritable(); Vector vector = null; String thisLine; int prevRowID = -1; boolean first = true; File[] filePathList = null; File inputFile = new File(inputPath); if (inputFile.isFile()) // if it is a file { filePathList = new File[1]; filePathList[0] = inputFile; } else { filePathList = inputFile.listFiles(); } if (filePathList == null) { log.error("The path " + inputPath + " does not exist"); return; } for (File file : filePathList) { BufferedReader br = new BufferedReader(new FileReader(file)); String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq"; writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class, VectorWritable.class, CompressionType.BLOCK); while ((thisLine = br.readLine()) != null) { // while loop begins here String[] splitted = thisLine.split(","); int rowID = Integer.parseInt(splitted[0]); int colID = Integer.parseInt(splitted[1]); double element = Double.parseDouble(splitted[2]); if (first) { first = false; vector = new SequentialAccessSparseVector(cardinality); } else if (rowID != prevRowID) { key.set(prevRowID); value.set(vector); //System.out.println(vector); writer.append(key, value);//write last row vector = new SequentialAccessSparseVector(cardinality); } prevRowID = rowID; vector.set(colID - base, element); } /*//here we append the last vector in each file (assuming that we will start a new row in the next file key.set(prevRowID); value.set(vector); //System.out.println("last vector"); //System.out.println(vector); writer.append(key,value);//write last row writer.close(); */ } if (writer != null) //append last vector in last file { key.set(prevRowID); value.set(vector); //System.out.println("last vector"); //System.out.println(vector); writer.append(key, value);//write last row writer.close(); } } catch (Exception e) { e.printStackTrace(); } } private static void printLogMessage(String argName) { log.error("Missing arguments -D" + argName); log.info( "Usage: -DInput=<path/to/input/matrix> -DOutput=<path/to/outputfolder> -DInputFmt=<DENSE/COO> -DCardinaality=<number of columns> [-DBase=<0/1>(0 if input is zero-based, 1 if input is 1-based]"); } }