Java tutorial
/**
 * Copyright 2012-2013 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under
 * the License.
 *
 * Author: maha alabduljalil <maha (at) cs.ucsb.edu>
 * @since Aug 13, 2012
 */
package edu.ucsb.cs.partitioning.statistics;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

import edu.ucsb.cs.partitioning.PartDriver;
import edu.ucsb.cs.partitioning.cosine.Organizer;
import edu.ucsb.cs.partitioning.jaccard.JaccardCoarsePartitionMain;
import edu.ucsb.cs.types.FeatureWeight;
import edu.ucsb.cs.types.FeatureWeightArrayWritable;
import edu.ucsb.cs.utilities.JobSubmitter;

/**
 * Produces statistics about the data and partitions, including the maximum,
 * minimum and average vector length, Baraglia dummy vectors for each partition
 * (stored in "bragliaVectors"), the size of each partition (both printed and
 * stored in "vectorsPerPartition") and a dummy vector for the whole data set.
 * Per-partition Baraglia vectors are sorted alphabetically, while the
 * whole-data vector is sorted decreasingly by popularity. It also prints the
 * number of skipped edges between partitions and the number of document-pair
 * comparisons omitted.
 *
 * It also writes a MapFile storing each partition name and its number of docs.
 * Sept 17th 2013: disabled writing of the Baraglia vector.
 *
 * @author Maha
 */
public class Collector {

    private static FeatureWeightArrayWritable maxWeightVector = new FeatureWeightArrayWritable(); // GlobalBaraglia
    // public static Path baragliaPath = new Path("bragliaVectors");
    public static Path partitionSizesPath = new Path("vectorsPerPartition");
    // private static MapFile.Writer baragliaWriter;
    private static MapFile.Writer partitionsWriter;

    /**
     * @param input : input path of cosine partitions (Gij ..).
     */
    public static void printCosineStatistics(JobConf job, String input)
            throws NumberFormatException, IOException {
        FileSystem fs = (new Path(input)).getFileSystem(job);
        printCommonStatistics(fs, input, job);
        String NumSkipPairsEdges = getNSkipCosineVecPairs(fs, new Path(input), job);
        printSkipInfo(new StringTokenizer(NumSkipPairsEdges, " ,"));
    }

    public static void printJaccardStatistics(JobConf job, String input)
            throws NumberFormatException, IOException {
        FileSystem fs = (new Path(input)).getFileSystem(job);
        printCommonStatistics(fs, input, job);
        String NumSkipPairsEdges = getNSkipJaccardDocPairs(fs, new Path(input), job);
        printSkipInfo(new StringTokenizer(NumSkipPairsEdges, " ,"));
    }

    public static void printGeneralStatistics(JobConf job, String input)
            throws NumberFormatException, IOException {
        FileSystem fs = (new Path(input)).getFileSystem(job);
        printCommonStatistics(fs, input, job);
        String NumSkipPairsEdges = getNSkipCosineVecPairs(fs, new Path(input), job);
        printSkipInfo(new StringTokenizer(NumSkipPairsEdges, " ,"));
    }

    public static void printCommonStatistics(FileSystem fs, String input, JobConf job)
            throws IOException {
        openBaragliaFile();
        openPartitionsFile();
        String NumMinMaxAvg = getNumMinMaxAvgVecLengthAvgPosting(fs, new Path(input), job);
        StringTokenizer tkz = new StringTokenizer(NumMinMaxAvg, " ,");
        System.out.println("\n [Statistics] ");
        System.out.println(" Number of partitions: " + getNumFiles(new Path(input), fs)
                + "\n Number of vectors: " + tkz.nextToken()
                + "\n Min vector length: " + tkz.nextToken()
                + "\n Max vector length: " + tkz.nextToken()
                + "\n Avg vector length: " + tkz.nextToken()
                + "\n Number of features: " + tkz.nextToken()
                + "\n Avg posting length: " + tkz.nextToken());
    }

    public static void printSkipInfo(StringTokenizer tkz) throws IOException {
        System.out.println(" Number of unique skipped vector pairs: " + tkz.nextToken()
                + " out of " + tkz.nextToken()
                + "\n Number of distinct partitions edges to skip: " + tkz.nextToken()
                + " out of " + tkz.nextToken() + "\n" + JobSubmitter.stars());
    }

    public static void openPartitionsFile() throws IOException {
        JobConf job = new JobConf();
        FileSystem fs = partitionSizesPath.getFileSystem(job);
        if (fs.exists(partitionSizesPath))
            fs.delete(partitionSizesPath);
        partitionsWriter = new MapFile.Writer(job, fs, partitionSizesPath.getName(), Text.class,
                LongWritable.class);
    }

    /**
     * Prepares a file to contain Baraglia's dummy vector and a dummy vector for
     * each produced static partition. Works fine but can cause an out-of-memory
     * error.
     */
    public static void openBaragliaFile() throws IOException {
        // JobConf job = new JobConf();
        // FileSystem fs = baragliaPath.getFileSystem(job);
        // if (fs.exists(baragliaPath))
        // fs.delete(baragliaPath);
        // baragliaWriter = new MapFile.Writer(job, fs, baragliaPath.getName(),
        // Text.class,
        // FeatureWeightArrayWritable.class);
    }

    public static FileStatus[] getFiles(Path inputPath, FileSystem fs) throws IOException {
        FileStatus[] files = null;
        if (fs.exists(inputPath)) {
            if (fs.isFile(inputPath)) {
                files = new FileStatus[1];
                files[0] = new FileStatus(0, false, 1, 1, 1, inputPath);
            } else
                files = fs.listStatus(inputPath);
        } else
            throw new FileNotFoundException("Error: " + inputPath.getName() + " does not exist.");
        return files;
    }

    public static int getNumFiles(Path inputPath, FileSystem fs) throws IOException {
        return getFiles(inputPath, fs).length;
    }

    /**
     * @param inputPath : path of all the input files.
     * @param fs : file system.
     * @return file paths sorted by file name.
     */
    public static Iterator<Path> getSortedFiles(Path inputPath, FileSystem fs) throws IOException {
        TreeSet<Path> paths = new TreeSet<Path>();
        FileStatus[] files = getFiles(inputPath, fs);
        for (int i = 0; i < files.length; i++)
            if (!fs.isDirectory(files[i].getPath()))
                paths.add(files[i].getPath());
        return paths.iterator();
    }

    public static String getNumMinMaxAvgVecLengthAvgPosting(FileSystem fs, Path inputPath,
            JobConf job) throws IOException {
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        long numDocuments = 0, minDocLength = Long.MAX_VALUE, maxDocLength = 0;
        double avgDocLength = 0;
        int partitionSize; // remove
        HashMap<Long, Float> partitionfeaturesWeight = new HashMap<Long, Float>();
        Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
        if (!pathItr.hasNext())
            return "0,0,0,0";
        while (pathItr.hasNext()) {
            inputPath = pathItr.next();
            SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
            partitionSize = 0; // remove
            while (in.next(key, value)) {
                partitionSize++; // remove
                numDocuments++;
                avgDocLength += value.vectorSize;
                if (minDocLength > value.vectorSize)
                    minDocLength = value.vectorSize;
                if (maxDocLength < value.vectorSize)
                    maxDocLength = value.vectorSize;
                for (int j = 0; j < value.vectorSize; j++) {
                    FeatureWeight current = value.vector[j];
                    updatePartitionBaraglia(partitionfeaturesWeight, current);
                }
            }
            System.out.println(inputPath.getName() + " has " + partitionSize + " vectors."); // remove
            partitionsWriter.append(new Text(inputPath.getName()), new LongWritable(partitionSize));
            in.close();
            writePartitionBaraglia(inputPath.getName(), partitionfeaturesWeight);
        }
        partitionsWriter.close();
        maxWeightVector.clear();
        String nFeaturesAvgPost = getNFeaturesAvgPosting(fs, inputPath.getParent(), job);
        avgDocLength = avgDocLength / numDocuments;
        return numDocuments + " , " + minDocLength + " , " + maxDocLength + " ," + avgDocLength
                + " ," + nFeaturesAvgPost;
    }

    public static String getNFeaturesAvgPosting(FileSystem fs, Path inputPath, JobConf job)
            throws IOException {
        HashMap<Long, Long> globalFeatures = new HashMap<Long, Long>();
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        double sumPostings = 0;
        Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
        while (pathItr.hasNext()) {
            inputPath = pathItr.next();
            SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
            while (in.next(key, value)) {
                for (int j = 0; j < value.vectorSize; j++)
                    if (globalFeatures.containsKey(value.vector[j].feature))
                        globalFeatures.put(value.vector[j].feature,
                                globalFeatures.get(value.vector[j].getFeature()) + 1);
                    else
                        globalFeatures.put(value.vector[j].feature, 1l);
            }
            in.close(); // close the reader for this partition file
        }
        Iterator<Long> featuresItr = globalFeatures.keySet().iterator();
        while ((featuresItr != null) && (featuresItr.hasNext())) {
            sumPostings += globalFeatures.get(featuresItr.next());
        }
        return globalFeatures.size() + "," + (sumPostings / globalFeatures.size());
    }

    /**
     * @param featuresCount : vector with all features and the size of their
     *        postings.
     * @return sum of the posting lengths of the features.
     */
    public static long getSumPostingLength(TreeMap<Long, Integer> featuresCount) {
        Iterator<Long> features = featuresCount.keySet().iterator();
        long sumPostings = 0;
        while (features.hasNext())
            sumPostings += featuresCount.get(features.next());
        return sumPostings;
    }

    /**
     * Writes the passed vector into the Baraglia file.
     * @param featuresWeight : partition features and their maximum weight to be
     *        written to HDFS and added to the global Baraglia vector.
     * @param partition : name of the partition passed.
     */
    public static void writePartitionBaraglia(String partition,
            HashMap<Long, Float> featuresWeight) throws IOException {
        maxWeightVector.set(featuresWeight.size());
        Iterator<Long> features = featuresWeight.keySet().iterator();
        int i = -1;
        while (features.hasNext()) {
            long f = features.next();
            maxWeightVector.setElement(++i, f, featuresWeight.get(f));
        }
        // baragliaWriter.append(new Text(partition), maxWeightVector);
        // updateGlobalBaraglia(globalFeatures, featuresWeight);
        featuresWeight.clear();
    }

    /**
     * @param globalFeatures : global vector of features to be updated.
     * @param partitionfeaturesWeight : features and their maximum weight from
     *        the current partition to be added to the global vector.
     */
    public static void updateGlobalBaraglia(HashSet<Long> globalFeatures,
            HashMap<Long, Float> partitionfeaturesWeight) {
        Iterator<Long> partfeatures = partitionfeaturesWeight.keySet().iterator();
        while (partfeatures.hasNext()) {
            long feature = partfeatures.next();
            if (!globalFeatures.contains(feature))
                globalFeatures.add(feature);
        }
    }

    /**
     * @param map : unsorted map of features to their count and maximum weight.
     * @return map entries sorted by feature count, decreasingly by popularity.
     */
    public static SortedSet<Map.Entry<Long, CountWeight>> entriesSortedByPopularity(
            Map<Long, CountWeight> map) {
        SortedSet<Map.Entry<Long, CountWeight>> sortedEntries = new TreeSet<Map.Entry<Long, CountWeight>>(
                new Comparator<Map.Entry<Long, CountWeight>>() {
                    public int compare(Map.Entry<Long, CountWeight> e1,
                            Map.Entry<Long, CountWeight> e2) {
                        return e1.getValue().compareTo(e2.getValue());
                    }
                });
        sortedEntries.addAll(map.entrySet());
        return sortedEntries;
    }

    /**
     * Adds the feature and weight of the current vector to the Baraglia vector
     * of the current partition.
     * @param featuresWeight : Baraglia vector for the currently processed
     *        partition.
     * @param current : current vector being read from the partition.
     */
    public static void updatePartitionBaraglia(HashMap<Long, Float> featuresWeight,
            FeatureWeight current) {
        if (featuresWeight.containsKey(current.feature)) {
            float existing = featuresWeight.get(current.feature);
            if (existing < current.weight) {
                existing = current.weight;
                featuresWeight.put(current.feature, existing);
            }
        } else
            featuresWeight.put(current.feature, current.weight);
    }

    public static boolean skip1dCoarseJaccardPartitions(String cfile, String ofile) {
        Iterator<Integer> itr = JaccardCoarsePartitionMain.skipList.keySet().iterator();
        while (itr.hasNext()) {
            int part1 = itr.next();
            if (Integer.parseInt(cfile) == part1) {
                ArrayList<Integer> iList = JaccardCoarsePartitionMain.skipList.get(part1);
                for (int k = 0; k < iList.size(); k++)
                    if (iList.get(k) == Integer.parseInt(ofile))
                        return true;
                break;
            }
        }
        return false;
    }

    public static String getNSkipCosineVecPairs(FileSystem fs, Path inputPath, JobConf job)
            throws IOException {
        long nSkipVecPair = 0, nVectors = 0, nSkipPartEdges = 0, nPartitions = 0;
        FileStatus[] files = getFiles(inputPath, fs);
        if (files == null)
            return null;
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            nPartitions++;
            long n = countFileVectors(fs, files[i].getPath(), job);
            nVectors += n;
            for (int j = i; j < files.length; j++) {
                inputPath = files[j].getPath();
                if (fs.isDirectory(inputPath))
                    continue;
                long m = countFileVectors(fs, files[j].getPath(), job);
                if (skipCosinePartitions(files[i].getPath().getName(), files[j].getPath().getName())) {
                    nSkipVecPair += (n * m);
                    nSkipPartEdges++;
                }
            }
        }
        return (nSkipVecPair + ",(" + nVectors + "C2)," + nSkipPartEdges + "," + ",(" + nPartitions
                + "C2),");
    }

    public static boolean skipCosinePartitions(String cfile, String ofile) {
        int cr = Organizer.getRow(cfile);
        int cc = Organizer.getCol(cfile);
        int or = Organizer.getRow(ofile);
        int oc = Organizer.getCol(ofile);
        if (((cr != cc) && (cc >= or)) || ((or != oc) && (oc >= cr)))
            return true;
        else
            return false;
    }

    /**
     * Not sure about calculations here anymore ..
     * @param fs
     * @param inputPath
     * @param job
     * @return
     * @throws IOException
     */
    public static String getNSkipJaccardDocPairs(FileSystem fs, Path inputPath, JobConf job)
            throws IOException {
        long nSkipVecPair = 0, nVecPairs = 0, nSkipPartEdges = 0, nPartitions = 0;
        FileStatus[] files = getFiles(inputPath, fs);
        if (files == null)
            return null;
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if (fs.isDirectory(inputPath))
                continue;
            nPartitions++;
            for (int j = 0; j < files.length; j++) {
                inputPath = files[j].getPath();
                if (fs.isDirectory(inputPath))
                    continue;
                long n = countFileVectors(fs, files[i].getPath(), job);
                long m = countFileVectors(fs, files[j].getPath(), job);
                if (skip1dCoarseJaccardPartitions(files[i].getPath().getName(),
                        files[j].getPath().getName())) {
                    nSkipVecPair += (n * m);
                    nSkipPartEdges++;
                }
                nVecPairs += (n * m);
            }
        }
        return (nSkipVecPair / 2 + "," + nVecPairs / 2 + "," + nSkipPartEdges / 2 + "," + ",("
                + nPartitions + "C2),");
    }

    public static long countFileVectors(FileSystem fs, Path inputFile, JobConf job)
            throws IOException {
        long nDocuments = 0;
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        if ((fs.isDirectory(inputFile)) || inputFile.getName().startsWith("_"))
            return 0;
        SequenceFile.Reader in = new SequenceFile.Reader(fs, inputFile, job);
        while (in.next(key, value))
            nDocuments++;
        in.close();
        return nDocuments;
    }

    public static long countDirVectors(FileSystem fs, Path inputDir, JobConf job)
            throws IOException {
        long nDocuments = 0;
        FileStatus[] files = getFiles(inputDir, fs);
        for (int i = 0; i < files.length; i++) {
            nDocuments += countFileVectors(fs, files[i].getPath(), job);
        }
        return nDocuments;
    }

    public static int getNumPartPairs(HashMap<Integer, ArrayList<Integer>> list) {
        int numPairs = 0;
        Iterator<Integer> itr = list.keySet().iterator();
        while (itr.hasNext()) {
            ArrayList<Integer> iList = list.get(itr.next());
            numPairs += iList.size();
        }
        return numPairs / 2; // distinct
    }
}
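For readers following along, here is a minimal sketch of how the class above might be invoked from a small driver. It only calls Collector.printCosineStatistics, which the listing defines; the class name CollectorExample and the partition directory "cosine-partitions" are placeholders for illustration, not paths or entry points produced by this project (the real entry point is PartDriver, and the partition directory depends on your own job configuration).

import org.apache.hadoop.mapred.JobConf;

import edu.ucsb.cs.partitioning.statistics.Collector;

/**
 * Hypothetical driver showing one way to call Collector.
 */
public class CollectorExample {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();

        // Assumed path: a directory of SequenceFiles, one per partition, holding
        // <LongWritable, FeatureWeightArrayWritable> records.
        String partitionsDir = "cosine-partitions";

        // Prints the partition/vector counts, min/max/avg vector length and
        // posting statistics, then the skipped partition-pair summary.
        Collector.printCosineStatistics(job, partitionsDir);
    }
}

The statistics are written to standard output, and the per-partition sizes end up in the "vectorsPerPartition" MapFile created by openPartitionsFile, so the sketch does not need to read any return value.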