List of usage examples for org.apache.hadoop.fs.FileSystem.getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
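Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call (the path /tmp/example.txt and the class name are placeholders, not taken from any of the sources): stat the file, request the block locations covering its full length, and print which hosts hold each block.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);

        // Stat the file, then ask for the blocks covering [0, file length).
        FileStatus status = fs.getFileStatus(path);
        BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());

        for (BlockLocation location : locations) {
            System.out.println("offset=" + location.getOffset()
                    + " length=" + location.getLength()
                    + " hosts=" + String.join(",", location.getHosts()));
        }
        fs.close();
    }
}

The FileStatus overload used here behaves like the Path overload in the signature above: both take a byte range (start, len) and return only the blocks that overlap that range.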
From source file:com.mellanox.r4h.DistributedFileSystem.java
License:Apache License
@Override
public BlockLocation[] getFileBlockLocations(Path p, final long start, final long len) throws IOException {
    statistics.incrementReadOps(1);
    final Path absF = fixRelativePart(p);
    return new FileSystemLinkResolver<BlockLocation[]>() {
        @Override
        public BlockLocation[] doCall(final Path p) throws IOException, UnresolvedLinkException {
            return dfs.getBlockLocations(getPathName(p), start, len);
        }

        @Override
        public BlockLocation[] next(final FileSystem fs, final Path p) throws IOException {
            return fs.getFileBlockLocations(p, start, len);
        }
    }.resolve(this, absF);
}
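Roughly, the FileSystemLinkResolver above is how DistributedFileSystem deals with symbolic links: doCall() runs the request against the resolved HDFS path, and if resolution fails because the path crosses into another file system, next() retries the same getFileBlockLocations call on the file system the link points to.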
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public FileSplit createFileSplitFromBSON(final BSONObject obj, final FileSystem fs, final FileStatus inputFile)
        throws IOException {
    long start = (Long) obj.get("s");
    long splitLen = (Long) obj.get("l");
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inputFile, start, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inputFile.getPath(), start, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from BSON. Using non-block-aware input split; "
                + e.getMessage());
        return new FileSplit(inputFile.getPath(), start, splitLen, null);
    }
}
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public FileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart,
        final long splitLen) {
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                + e.getMessage());
        return new FileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
}
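Both BSONSplitter methods above delegate block selection to a getLargestBlockIndex helper that is not shown on this page. Purely as an illustration of the idea (a guess, not the mongo-hadoop implementation), a helper that simply picks the widest of the returned blocks could look like this, assuming org.apache.hadoop.fs.BlockLocation is imported:

// Hypothetical helper: return the index of the block covering the most bytes.
static int pickWidestBlock(BlockLocation[] blocks) {
    int widest = 0;
    for (int i = 1; i < blocks.length; i++) {
        if (blocks[i].getLength() > blocks[widest].getLength()) {
            widest = i;
        }
    }
    return widest;
}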
From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java
License:Apache License
/**
 * The main thing that addSSTableSplit handles is splitting SSTables using their
 * index if available. The general algorithm is that if the file is larger than the
 * block size plus some fuzzy factor, it is split along offsets read from the
 * corresponding -Index.db file.
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);
        long bytesRemaining = length;
        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }
        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors,
                    fs.exists(compressionPath), compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}
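The getBlockIndex call above maps the midpoint of each split back to the block that contains it. A minimal sketch of that kind of lookup, in the spirit of Hadoop's FileInputFormat.getBlockIndex (illustrative only, not the Aegisthus implementation; assumes BlockLocation and IOException imports):

// Illustrative sketch: find the block whose byte range contains the given offset.
static int blockIndexForOffset(BlockLocation[] blocks, long offset) throws IOException {
    for (int i = 0; i < blocks.length; i++) {
        long start = blocks[i].getOffset();
        long end = start + blocks[i].getLength();
        if (offset >= start && offset < end) {
            return i;
        }
    }
    throw new IOException("Offset " + offset + " is not covered by any block location");
}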
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();
    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }
        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }
    return splits;
}
From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job
 *            the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);

        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum,
                    compressType, blocksize));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType, blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file:com.ricemap.spateDB.mapred.IndexedPrism.java
License:Apache License
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(final JobConf job, int numSplits) throws IOException {
    // Get a list of all input files. There should be exactly two files.
    final Path[] inputFiles = getInputPaths(job);
    GlobalIndex<Partition> gIndexes[] = new GlobalIndex[inputFiles.length];

    BlockFilter blockFilter = null;
    try {
        Class<? extends BlockFilter> blockFilterClass = job.getClass(SpatialSite.FilterClass, null,
                BlockFilter.class);
        if (blockFilterClass != null) {
            // Get all blocks the user wants to process
            blockFilter = blockFilterClass.newInstance();
            blockFilter.configure(job);
        }
    } catch (InstantiationException e1) {
        e1.printStackTrace();
    } catch (IllegalAccessException e1) {
        e1.printStackTrace();
    }

    if (blockFilter != null) {
        // Extract global indexes from input files
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            gIndexes[i_file] = SpatialSite.getGlobalIndex(fs, inputFiles[i_file]);
        }
    }

    final Vector<CombineFileSplit> matchedSplits = new Vector<CombineFileSplit>();
    if (gIndexes[0] == null || gIndexes[1] == null) {
        // Join every possible pair (Cartesian product)
        BlockLocation[][] fileBlockLocations = new BlockLocation[inputFiles.length][];
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            FileStatus fileStatus = fs.getFileStatus(inputFiles[i_file]);
            fileBlockLocations[i_file] = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        }
        LOG.info("Doing a Cartesian product of blocks: " + fileBlockLocations[0].length + "x"
                + fileBlockLocations[1].length);
        for (BlockLocation block1 : fileBlockLocations[0]) {
            for (BlockLocation block2 : fileBlockLocations[1]) {
                FileSplit fsplit1 = new FileSplit(inputFiles[0], block1.getOffset(), block1.getLength(),
                        block1.getHosts());
                FileSplit fsplit2 = new FileSplit(inputFiles[1], block2.getOffset(), block2.getLength(),
                        block2.getHosts());
                CombineFileSplit combinedSplit = (CombineFileSplit) FileSplitUtil.combineFileSplits(job,
                        fsplit1, fsplit2);
                matchedSplits.add(combinedSplit);
            }
        }
    } else {
        // Filter block pairs by the BlockFilter
        blockFilter.selectCellPairs(gIndexes[0], gIndexes[1], new ResultCollector2<Partition, Partition>() {
            @Override
            public void collect(Partition p1, Partition p2) {
                try {
                    List<FileSplit> splits1 = new ArrayList<FileSplit>();
                    Path path1 = new Path(inputFiles[0], p1.filename);
                    splitFile(job, path1, splits1);
                    List<FileSplit> splits2 = new ArrayList<FileSplit>();
                    Path path2 = new Path(inputFiles[1], p2.filename);
                    splitFile(job, path2, splits2);
                    for (FileSplit split1 : splits1) {
                        for (FileSplit split2 : splits2) {
                            matchedSplits.add((CombineFileSplit) FileSplitUtil.combineFileSplits(job, split1,
                                    split2));
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }

    LOG.info("Matched " + matchedSplits.size() + " combine splits");
    // Return all matched splits
    return matchedSplits.toArray(new InputSplit[matchedSplits.size()]);
}
From source file:com.ricemap.spateDB.mapred.IndexedPrism.java
License:Apache License
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;
        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}
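SPLIT_SLOP here plays the same role as the identically named constant in Hadoop's FileInputFormat, where it is 1.1: the loop keeps carving off block-sized splits only while more than 1.1 blocks remain, and whatever is left (up to about 1.1 blocks) is emitted as a single final split rather than leaving a tiny trailing one.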
From source file:com.ricemap.spateDB.mapred.FileSplitUtil.java
License:Apache License
/**
 * Combines a number of input splits into the given numSplits.
 *
 * @param conf
 * @param inputSplits
 * @param numSplits
 * @return
 * @throws IOException
 */
public static InputSplit[] autoCombineSplits(JobConf conf, Vector<FileSplit> inputSplits, int numSplits)
        throws IOException {
    LOG.info("Combining " + inputSplits.size() + " splits into " + numSplits);
    Map<String, Vector<FileSplit>> blocksPerHost = new HashMap<String, Vector<FileSplit>>();
    for (FileSplit fsplit : inputSplits) {
        // Get locations for this split
        final Path path = fsplit.getPath();
        final FileSystem fs = path.getFileSystem(conf);
        BlockLocation[] blockLocations = fs.getFileBlockLocations(fs.getFileStatus(path), fsplit.getStart(),
                fsplit.getLength());
        for (BlockLocation blockLocation : blockLocations) {
            for (String hostName : blockLocation.getHosts()) {
                if (!blocksPerHost.containsKey(hostName))
                    blocksPerHost.put(hostName, new Vector<FileSplit>());
                blocksPerHost.get(hostName).add(fsplit);
            }
        }
    }

    // If the user requested a fewer number of splits, start to combine them
    InputSplit[] combined_splits = new InputSplit[numSplits];
    int splitsAvailable = inputSplits.size();
    for (int i = 0; i < numSplits; i++) {
        // Decide how many splits to combine
        int numSplitsToCombine = splitsAvailable / (numSplits - i);
        Vector<FileSplit> splitsToCombine = new Vector<FileSplit>();
        while (numSplitsToCombine > 0) {
            // Choose the host with minimum number of splits
            Map.Entry<String, Vector<FileSplit>> minEntry = null;
            for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
                if (minEntry == null || entry.getValue().size() < minEntry.getValue().size()) {
                    minEntry = entry;
                }
            }
            // Combine all or some of blocks in this host
            for (FileSplit fsplit : minEntry.getValue()) {
                if (!splitsToCombine.contains(fsplit)) {
                    splitsToCombine.add(fsplit);
                    if (--numSplitsToCombine == 0)
                        break;
                }
            }
            if (numSplitsToCombine != 0) {
                // Remove this host so that it is not selected again
                blocksPerHost.remove(minEntry.getKey());
            }
        }
        combined_splits[i] = combineFileSplits(conf, splitsToCombine, 0, splitsToCombine.size());
        for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
            entry.getValue().removeAll(splitsToCombine);
        }
        splitsAvailable -= splitsToCombine.size();
    }

    LOG.info("Combined splits " + combined_splits.length);
    return combined_splits;
}
From source file:com.ricemap.spateDB.util.ReadFile.java
License:Apache License
public static void main(String[] args) throws Exception {
    CommandLineArguments cla = new CommandLineArguments(args);
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getOffset() == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getOffset(), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}
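Note that the second getFileBlockLocations call above requests a range of length 1 starting at the user-supplied offset, so it returns only the block containing that byte, while the first call covers [0, length) and returns the locations of every block in the file.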