/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is MultiFileCollectionInputFormat.java.
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 */
package org.terrier.structures.indexing.singlepass.hadoop;

import gnu.trove.TObjectLongHashMap;
import gnu.trove.TObjectLongProcedure;

import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MultiFileInputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.apache.log4j.Logger;
import org.terrier.indexing.Document;

/**
 * Input Format Class for Hadoop Indexing. Splits the input collection into
 * sets of files, where each Map task gets about the same number of files.
 * Files are assumed to be unsplittable and are never split. Each split
 * contains only adjacent files - i.e. split 0 always holds the first file,
 * and the last split always holds the last file.
 * @author Richard McCreadie and Craig Macdonald
 * @since 2.2
 */
@SuppressWarnings("deprecation")
public class MultiFileCollectionInputFormat extends MultiFileInputFormat<Text, SplitAwareWrapper<Document>>
{
    /** logger for this class */
    protected static final Logger logger = Logger.getLogger(MultiFileCollectionInputFormat.class);

    /**
     * Instantiates a FileCollectionRecordReader using the specified split,
     * which is assumed to be a PositionAwareSplit wrapping a CombineFileSplit.
     * @param genericSplit contains the files to be processed
     * @param job JobConf of this job
     * @param reporter used to report progress
     */
    @SuppressWarnings("unchecked")
    @Override
    public RecordReader<Text, SplitAwareWrapper<Document>> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter)
        throws IOException
    {
        reporter.setStatus(genericSplit.toString());
        return new FileCollectionRecordReader(job,
            (PositionAwareSplit<CombineFileSplit>) genericSplit);
    }

    /**
     * Splits the input collection into sets of files, where each Map task
     * gets about the same number of files.
     */
    @SuppressWarnings("unchecked")
    @Override
    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException
    {
        Path[] paths = FileInputFormat.getInputPaths(job);
        // HADOOP-1818: Manage splits only if there are paths
        if (paths.length == 0)
        {
            return new InputSplit[0];
        }

        // clamp the requested number of splits to [1, number of files]
        if (numSplits > paths.length)
        {
            numSplits = paths.length;
        }
        else if (numSplits < 1)
        {
            numSplits = 1;
        }
        logger.info("Allocating " + paths.length + " files across "
            + numSplits + " map tasks");

        List<PositionAwareSplit<CombineFileSplit>> splits =
            new ArrayList<PositionAwareSplit<CombineFileSplit>>(numSplits);
        final int numPaths = paths.length;
        long[] lengths = new long[numPaths];
        TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[])
            Array.newInstance(TObjectLongHashMap.class, numPaths);
        final FileSystem fs = FileSystem.get(job);
        // for each input file, record its length and, for every host holding
        // one of its blocks, the number of bytes of the file on that host
        for (int i = 0; i < paths.length; i++)
        {
            final FileStatus fss = fs.getFileStatus(paths[i]);
            lengths[i] = fss.getLen();
            final TObjectLongHashMap<String> location2size =
                locations[i] = new TObjectLongHashMap<String>();
            final long normalblocksize = fss.getBlockSize();
            for (long offset = 0; offset < lengths[i]; offset += normalblocksize)
            {
                // size of this block: a full block, or whatever remains of the file
                final long blocksize = Math.min(normalblocksize, lengths[i] - offset);
                final BlockLocation[] blockLocations =
                    fs.getFileBlockLocations(fss, offset, blocksize);
                for (BlockLocation bl : blockLocations)
                {
                    for (String host : bl.getHosts())
                    {
                        location2size.adjustOrPutValue(host, blocksize, blocksize);
                    }
                }
            }
        }

        // over-estimate using ceil, to ensure that the last split is not /too/ big
        final int numberOfFilesPerSplit =
            (int) Math.ceil((double) paths.length / (double) numSplits);
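        // e.g. (illustrative): 10 input files and numSplits=4 give
        // numberOfFilesPerSplit = ceil(10/4) = 3, so the loop below
        // produces splits of 3, 3, 3 and 1 files respectively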
        int pathsUsed = 0;
        int splitnum = 0;
        CombineFileSplit mfs;
        // build one split per group of adjacent files; every split holds
        // numberOfFilesPerSplit files, except possibly the last
        while (pathsUsed < numPaths)
        {
            /* calculate split size for this task - usually numberOfFilesPerSplit,
             * but less than this for the last split */
            final int splitSizeForThisSplit =
                numberOfFilesPerSplit + pathsUsed > numPaths
                    ? numPaths - pathsUsed
                    : numberOfFilesPerSplit;
            // arrays of information for this split
            Path[] splitPaths = new Path[splitSizeForThisSplit];
            long[] splitLengths = new long[splitSizeForThisSplit];
            // starts are left at zero: whole files are read, never partial blocks
            long[] splitStarts = new long[splitSizeForThisSplit];
            final TObjectLongHashMap<String> allLocationsForSplit =
                new TObjectLongHashMap<String>();
            // final recommended locations for this split
            String[] splitLocations = null;
            // aggregate, over all files in this split, the bytes held per host
            for (int i = 0; i < splitSizeForThisSplit; i++)
            {
                locations[pathsUsed + i].forEachEntry(new TObjectLongProcedure<String>()
                {
                    public boolean execute(String host, long bytes)
                    {
                        allLocationsForSplit.adjustOrPutValue(host, bytes, bytes);
                        return true;
                    }
                });
            }
            // recommend at most 3 locations: the hosts holding most of this split
            if (allLocationsForSplit.size() <= 3)
            {
                splitLocations = allLocationsForSplit.keys(
                    new String[allLocationsForSplit.size()]);
            }
            else
            {
                String[] hosts = allLocationsForSplit.keys(
                    new String[allLocationsForSplit.size()]);
                // sort hosts by descending number of bytes of this split held
                Arrays.sort(hosts, new Comparator<String>()
                {
                    public int compare(String o1, String o2)
                    {
                        final long diffamount =
                            allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                        if (diffamount > 0)
                        {
                            return -1;
                        }
                        else if (diffamount < 0)
                        {
                            return 1;
                        }
                        return 0;
                    }
                });
                splitLocations = new String[3];
                System.arraycopy(hosts, 0, splitLocations, 0, 3);
            }

            // copy information for this split
            System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
            System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
            // count the number of paths consumed
            pathsUsed += splitSizeForThisSplit;

            // make the actual split object
            //logger.info("New split of size " + splitSizeForThisSplit);
            mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
            splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
            splitnum++;
        }

        if (pathsUsed != paths.length)
        {
            throw new IOException("Number of used paths does not equal total available paths!");
        }
        return splits.toArray(new PositionAwareSplit[splits.size()]);
    }
}
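/*
 * Usage sketch (not part of the original Terrier source): a minimal example
 * of how this input format might be driven directly, via the deprecated
 * org.apache.hadoop.mapred API that the class itself targets. The class name,
 * job name and input path below are illustrative assumptions only.
 */
class MultiFileCollectionInputFormatUsageSketch
{
    public static void main(String[] args) throws IOException
    {
        // hypothetical job configuration
        JobConf job = new JobConf(MultiFileCollectionInputFormat.class);
        job.setJobName("indexing-split-sketch"); // hypothetical job name
        // register the files that getSplits() will allocate across map tasks
        FileInputFormat.setInputPaths(job, new Path("/collections/sample")); // hypothetical path
        // request 4 map tasks; getSplits() clamps this to [1, number of files]
        InputSplit[] splits = new MultiFileCollectionInputFormat().getSplits(job, 4);
        System.out.println("Produced " + splits.length + " splits");
    }
}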