Example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations

List of usage examples for org.apache.hadoop.fs FileSystem getFileBlockLocations

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getFileBlockLocations.

Prototype

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException 

Document

Return an array containing the hostnames, offsets, and sizes of portions of the given file.
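
A minimal caller-side sketch (assuming a default Configuration; the argument path and class name are illustrative, not taken from any of the sources below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // file whose block locations we want
        FileSystem fs = path.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(path);
        // Ask for the blocks covering the whole file: offset 0, length = file length.
        BlockLocation[] locations = fs.getFileBlockLocations(path, 0, status.getLen());
        for (BlockLocation location : locations) {
            System.out.println("offset=" + location.getOffset()
                    + " length=" + location.getLength()
                    + " hosts=" + String.join(",", location.getHosts()));
        }
    }
}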

Usage

From source file:com.mellanox.r4h.DistributedFileSystem.java

License:Apache License

@Override
public BlockLocation[] getFileBlockLocations(Path p, final long start, final long len) throws IOException {
    statistics.incrementReadOps(1);
    final Path absF = fixRelativePart(p);
    return new FileSystemLinkResolver<BlockLocation[]>() {
        @Override
        public BlockLocation[] doCall(final Path p) throws IOException, UnresolvedLinkException {
            return dfs.getBlockLocations(getPathName(p), start, len);
        }

        @Override
        public BlockLocation[] next(final FileSystem fs, final Path p) throws IOException {
            return fs.getFileBlockLocations(p, start, len);
        }
    }.resolve(this, absF);
}

From source file:com.mongodb.hadoop.splitter.BSONSplitter.java

License:Apache License

public FileSplit createFileSplitFromBSON(final BSONObject obj, final FileSystem fs, final FileStatus inputFile)
        throws IOException {
    long start = (Long) obj.get("s");
    long splitLen = (Long) obj.get("l");
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inputFile, start, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inputFile.getPath(), start, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn(
                "Couldn't find block locations when constructing input split from BSON. Using non-block-aware input split; "
                        + e.getMessage());
        return new FileSplit(inputFile.getPath(), start, splitLen, null);
    }
}

From source file:com.mongodb.hadoop.splitter.BSONSplitter.java

License:Apache License

public FileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart,
        final long splitLen) {
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn(
                "Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                        + e.getMessage());
        return new FileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
}
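
Both BSONSplitter methods above delegate to a getLargestBlockIndex helper that is not reproduced on this page. A plausible sketch of such a helper (an assumption about its behavior, not the project's actual code) picks the returned block that covers the most bytes, so the split is scheduled on the hosts holding the bulk of its data:

public static int getLargestBlockIndex(final BlockLocation[] blockLocations) {
    if (blockLocations == null) {
        return -1;
    }
    int index = -1;
    long largestLength = -1;
    // Hypothetical reconstruction: return the index of the longest block,
    // or -1 if no locations were returned.
    for (int i = 0; i < blockLocations.length; i++) {
        if (blockLocations[i].getLength() > largestLength) {
            largestLength = blockLocations[i].getLength();
            index = i;
        }
    }
    return index;
}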

From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java

License:Apache License

/**
 * The main thing that addSSTableSplit handles is splitting SSTables
 * using their index if available. The general algorithm is that if the file
 * is larger than the block size plus some fuzzy factor, it is split at
 * offsets taken from the index; otherwise it is handled as a single split.
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);

        long bytesRemaining = length;

        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single
            // split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }

        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors, fs.exists(compressionPath),
                    compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();
    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }

        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }

    return splits;
}
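
The TODO above leaves each file as a single split covering every block host. A block-aware variant (a sketch under the same imports, not the project's code) would instead emit one FileSplit per block location:

for (final BlockLocation location : blkLocations) {
    // One split per block, located on the hosts that actually hold that block.
    splits.add(new FileSplit(path, location.getOffset(), location.getLength(), location.getHosts()));
}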

From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job
 *            the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);
        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum, compressType,
                    blocksize));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType, blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:com.ricemap.spateDB.mapred.IndexedPrism.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(final JobConf job, int numSplits) throws IOException {
    // Get a list of all input files. There should be exactly two files.
    final Path[] inputFiles = getInputPaths(job);
    GlobalIndex<Partition> gIndexes[] = new GlobalIndex[inputFiles.length];

    BlockFilter blockFilter = null;
    try {
        Class<? extends BlockFilter> blockFilterClass = job.getClass(SpatialSite.FilterClass, null,
                BlockFilter.class);
        if (blockFilterClass != null) {
            // Get all blocks the user wants to process
            blockFilter = blockFilterClass.newInstance();
            blockFilter.configure(job);
        }
    } catch (InstantiationException e1) {
        e1.printStackTrace();
    } catch (IllegalAccessException e1) {
        e1.printStackTrace();
    }

    if (blockFilter != null) {
        // Extract global indexes from input files

        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            gIndexes[i_file] = SpatialSite.getGlobalIndex(fs, inputFiles[i_file]);
        }
    }

    final Vector<CombineFileSplit> matchedSplits = new Vector<CombineFileSplit>();
    if (gIndexes[0] == null || gIndexes[1] == null) {
        // Join every possible pair (Cartesian product)
        BlockLocation[][] fileBlockLocations = new BlockLocation[inputFiles.length][];
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            FileStatus fileStatus = fs.getFileStatus(inputFiles[i_file]);
            fileBlockLocations[i_file] = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        }
        LOG.info("Doing a Cartesian product of blocks: " + fileBlockLocations[0].length + "x"
                + fileBlockLocations[1].length);
        for (BlockLocation block1 : fileBlockLocations[0]) {
            for (BlockLocation block2 : fileBlockLocations[1]) {
                FileSplit fsplit1 = new FileSplit(inputFiles[0], block1.getOffset(), block1.getLength(),
                        block1.getHosts());
                FileSplit fsplit2 = new FileSplit(inputFiles[1], block2.getOffset(), block2.getLength(),
                        block2.getHosts());
                CombineFileSplit combinedSplit = (CombineFileSplit) FileSplitUtil.combineFileSplits(job,
                        fsplit1, fsplit2);
                matchedSplits.add(combinedSplit);
            }
        }
    } else {
        // Filter block pairs by the BlockFilter
        blockFilter.selectCellPairs(gIndexes[0], gIndexes[1], new ResultCollector2<Partition, Partition>() {
            @Override
            public void collect(Partition p1, Partition p2) {
                try {
                    List<FileSplit> splits1 = new ArrayList<FileSplit>();
                    Path path1 = new Path(inputFiles[0], p1.filename);
                    splitFile(job, path1, splits1);

                    List<FileSplit> splits2 = new ArrayList<FileSplit>();
                    Path path2 = new Path(inputFiles[1], p2.filename);
                    splitFile(job, path2, splits2);

                    for (FileSplit split1 : splits1) {
                        for (FileSplit split2 : splits2) {
                            matchedSplits.add(
                                    (CombineFileSplit) FileSplitUtil.combineFileSplits(job, split1, split2));
                        }
                    }

                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }

    LOG.info("Matched " + matchedSplits.size() + " combine splits");

    // Return all matched splits
    return matchedSplits.toArray(new InputSplit[matchedSplits.size()]);
}

From source file:com.ricemap.spateDB.mapred.IndexedPrism.java

License:Apache License

public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}

From source file:com.ricemap.spateDB.mapred.FileSplitUtil.java

License:Apache License

/**
 * Combines a number of input splits into the given numSplits.
 * @param conf
 * @param inputSplits
 * @param numSplits
 * @return
 * @throws IOException 
 */
public static InputSplit[] autoCombineSplits(JobConf conf, Vector<FileSplit> inputSplits, int numSplits)
        throws IOException {
    LOG.info("Combining " + inputSplits.size() + " splits into " + numSplits);
    Map<String, Vector<FileSplit>> blocksPerHost = new HashMap<String, Vector<FileSplit>>();
    for (FileSplit fsplit : inputSplits) {
        // Get locations for this split
        final Path path = fsplit.getPath();
        final FileSystem fs = path.getFileSystem(conf);
        BlockLocation[] blockLocations = fs.getFileBlockLocations(fs.getFileStatus(path), fsplit.getStart(),
                fsplit.getLength());
        for (BlockLocation blockLocation : blockLocations) {
            for (String hostName : blockLocation.getHosts()) {
                if (!blocksPerHost.containsKey(hostName))
                    blocksPerHost.put(hostName, new Vector<FileSplit>());
                blocksPerHost.get(hostName).add(fsplit);
            }
        }
    }

    // If the user requested a fewer number of splits, start to combine them
    InputSplit[] combined_splits = new InputSplit[numSplits];
    int splitsAvailable = inputSplits.size();

    for (int i = 0; i < numSplits; i++) {
        // Decide how many splits to combine
        int numSplitsToCombine = splitsAvailable / (numSplits - i);
        Vector<FileSplit> splitsToCombine = new Vector<FileSplit>();
        while (numSplitsToCombine > 0) {
            // Choose the host with minimum number of splits
            Map.Entry<String, Vector<FileSplit>> minEntry = null;
            for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
                if (minEntry == null || entry.getValue().size() < minEntry.getValue().size()) {
                    minEntry = entry;
                }
            }
            // Combine all or some of blocks in this host
            for (FileSplit fsplit : minEntry.getValue()) {
                if (!splitsToCombine.contains(fsplit)) {
                    splitsToCombine.add(fsplit);
                    if (--numSplitsToCombine == 0)
                        break;
                }
            }
            if (numSplitsToCombine != 0) {
                // Remove this host so that it is not selected again
                blocksPerHost.remove(minEntry.getKey());
            }
        }

        combined_splits[i] = combineFileSplits(conf, splitsToCombine, 0, splitsToCombine.size());

        for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
            entry.getValue().removeAll(splitsToCombine);
        }
        splitsAvailable -= splitsToCombine.size();
    }

    LOG.info("Combined splits " + combined_splits.length);
    return combined_splits;
}

From source file:com.ricemap.spateDB.util.ReadFile.java

License:Apache License

public static void main(String[] args) throws Exception {
    CommandLineArguments cla = new CommandLineArguments(args);
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getOffset() == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getOffset(), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}