List of usage examples for org.apache.hadoop.fs.FileSystem.getFileBlockLocations
public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException
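Before the project-specific examples below, here is a minimal, self-contained sketch of a typical call (the path /tmp/example.txt and the class name are placeholders, not taken from any of the sources): stat the file, request the block locations covering its full length, and print which hosts hold each block.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);

        // Stat the file, then ask for the blocks covering [0, file length).
        FileStatus status = fs.getFileStatus(path);
        BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());

        for (BlockLocation location : locations) {
            System.out.println("offset=" + location.getOffset()
                    + " length=" + location.getLength()
                    + " hosts=" + String.join(",", location.getHosts()));
        }
        fs.close();
    }
}

The FileStatus overload used here behaves like the Path overload in the signature above: both take a byte range (start, len) and return only the blocks that overlap that range.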
From source file:com.mellanox.r4h.DistributedFileSystem.java
License:Apache License
@Override
public BlockLocation[] getFileBlockLocations(Path p, final long start, final long len) throws IOException {
    statistics.incrementReadOps(1);
    final Path absF = fixRelativePart(p);
    return new FileSystemLinkResolver<BlockLocation[]>() {
        @Override
        public BlockLocation[] doCall(final Path p) throws IOException, UnresolvedLinkException {
            return dfs.getBlockLocations(getPathName(p), start, len);
        }

        @Override
        public BlockLocation[] next(final FileSystem fs, final Path p) throws IOException {
            return fs.getFileBlockLocations(p, start, len);
        }
    }.resolve(this, absF);
}
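Roughly, the FileSystemLinkResolver above is how DistributedFileSystem deals with symbolic links: doCall() runs the request against the resolved HDFS path, and if resolution fails because the path crosses into another file system, next() retries the same getFileBlockLocations call on the file system the link points to.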
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public FileSplit createFileSplitFromBSON(final BSONObject obj, final FileSystem fs, final FileStatus inputFile)
        throws IOException {
    long start = (Long) obj.get("s");
    long splitLen = (Long) obj.get("l");
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inputFile, start, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inputFile.getPath(), start, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from BSON. Using non-block-aware input split; "
                + e.getMessage());
        return new FileSplit(inputFile.getPath(), start, splitLen, null);
    }
}
From source file:com.mongodb.hadoop.splitter.BSONSplitter.java
License:Apache License
public FileSplit createFileSplit(final FileStatus inFile, final FileSystem fs, final long splitStart,
        final long splitLen) {
    try {
        BlockLocation[] blkLocations = fs.getFileBlockLocations(inFile, splitStart, splitLen);
        int blockIndex = getLargestBlockIndex(blkLocations);
        return new FileSplit(inFile.getPath(), splitStart, splitLen, blkLocations[blockIndex].getHosts());
    } catch (IOException e) {
        LOG.warn("Couldn't find block locations when constructing input split from byte offset. Using non-block-aware input split; "
                + e.getMessage());
        return new FileSplit(inFile.getPath(), splitStart, splitLen, null);
    }
}
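Both BSONSplitter methods above delegate block selection to a getLargestBlockIndex helper that is not shown on this page. Purely as an illustration of the idea (a guess, not the mongo-hadoop implementation), a helper that simply picks the widest of the returned blocks could look like this, assuming org.apache.hadoop.fs.BlockLocation is imported:

// Hypothetical helper: return the index of the block covering the most bytes.
static int pickWidestBlock(BlockLocation[] blocks) {
    int widest = 0;
    for (int i = 1; i < blocks.length; i++) {
        if (blocks[i].getLength() > blocks[widest].getLength()) {
            widest = i;
        }
    }
    return widest;
}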
From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java
License:Apache License
/**
 * The main thing that addSSTableSplit handles is splitting SSTables using their
 * index if available. The general algorithm is that if the file is larger than the
 * block size plus some fuzzy factor, it is split along offsets read from the
 * corresponding -Index.db file.
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);
        long bytesRemaining = length;
        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }
        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors,
                    fs.exists(compressionPath), compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}
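The getBlockIndex call above maps the midpoint of each split back to the block that contains it. A minimal sketch of that kind of lookup, in the spirit of Hadoop's FileInputFormat.getBlockIndex (illustrative only, not the Aegisthus implementation; assumes BlockLocation and IOException imports):

// Illustrative sketch: find the block whose byte range contains the given offset.
static int blockIndexForOffset(BlockLocation[] blocks, long offset) throws IOException {
    for (int i = 0; i < blocks.length; i++) {
        long start = blocks[i].getOffset();
        long end = start + blocks[i].getLength();
        if (offset >= start && offset < end) {
            return i;
        }
    }
    throw new IOException("Offset " + offset + " is not covered by any block location");
}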
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();
    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }
        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }
    return splits;
}
From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job
 *            the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);

        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum,
                    compressType, blocksize));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType, blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file:com.ricemap.spateDB.mapred.IndexedPrism.java
License:Apache License
@SuppressWarnings("unchecked")
@Override
public InputSplit[] getSplits(final JobConf job, int numSplits) throws IOException {
    // Get a list of all input files. There should be exactly two files.
    final Path[] inputFiles = getInputPaths(job);
    GlobalIndex<Partition> gIndexes[] = new GlobalIndex[inputFiles.length];

    BlockFilter blockFilter = null;
    try {
        Class<? extends BlockFilter> blockFilterClass = job.getClass(SpatialSite.FilterClass, null,
                BlockFilter.class);
        if (blockFilterClass != null) {
            // Get all blocks the user wants to process
            blockFilter = blockFilterClass.newInstance();
            blockFilter.configure(job);
        }
    } catch (InstantiationException e1) {
        e1.printStackTrace();
    } catch (IllegalAccessException e1) {
        e1.printStackTrace();
    }

    if (blockFilter != null) {
        // Extract global indexes from input files
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            gIndexes[i_file] = SpatialSite.getGlobalIndex(fs, inputFiles[i_file]);
        }
    }

    final Vector<CombineFileSplit> matchedSplits = new Vector<CombineFileSplit>();
    if (gIndexes[0] == null || gIndexes[1] == null) {
        // Join every possible pair (Cartesian product)
        BlockLocation[][] fileBlockLocations = new BlockLocation[inputFiles.length][];
        for (int i_file = 0; i_file < inputFiles.length; i_file++) {
            FileSystem fs = inputFiles[i_file].getFileSystem(job);
            FileStatus fileStatus = fs.getFileStatus(inputFiles[i_file]);
            fileBlockLocations[i_file] = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        }
        LOG.info("Doing a Cartesian product of blocks: " + fileBlockLocations[0].length + "x"
                + fileBlockLocations[1].length);
        for (BlockLocation block1 : fileBlockLocations[0]) {
            for (BlockLocation block2 : fileBlockLocations[1]) {
                FileSplit fsplit1 = new FileSplit(inputFiles[0], block1.getOffset(), block1.getLength(),
                        block1.getHosts());
                FileSplit fsplit2 = new FileSplit(inputFiles[1], block2.getOffset(), block2.getLength(),
                        block2.getHosts());
                CombineFileSplit combinedSplit = (CombineFileSplit) FileSplitUtil.combineFileSplits(job,
                        fsplit1, fsplit2);
                matchedSplits.add(combinedSplit);
            }
        }
    } else {
        // Filter block pairs by the BlockFilter
        blockFilter.selectCellPairs(gIndexes[0], gIndexes[1], new ResultCollector2<Partition, Partition>() {
            @Override
            public void collect(Partition p1, Partition p2) {
                try {
                    List<FileSplit> splits1 = new ArrayList<FileSplit>();
                    Path path1 = new Path(inputFiles[0], p1.filename);
                    splitFile(job, path1, splits1);
                    List<FileSplit> splits2 = new ArrayList<FileSplit>();
                    Path path2 = new Path(inputFiles[1], p2.filename);
                    splitFile(job, path2, splits2);
                    for (FileSplit split1 : splits1) {
                        for (FileSplit split2 : splits2) {
                            matchedSplits.add((CombineFileSplit) FileSplitUtil.combineFileSplits(job, split1,
                                    split2));
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }

    LOG.info("Matched " + matchedSplits.size() + " combine splits");
    // Return all matched splits
    return matchedSplits.toArray(new InputSplit[matchedSplits.size()]);
}
From source file:com.ricemap.spateDB.mapred.IndexedPrism.java
License:Apache License
public void splitFile(JobConf job, Path path, List<FileSplit> splits) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    FileSystem fs = path.getFileSystem(job);
    FileStatus file = fs.getFileStatus(path);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long splitSize = blockSize;
        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
            splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
    } else if (length != 0) {
        String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
    } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
}
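SPLIT_SLOP here plays the same role as the identically named constant in Hadoop's FileInputFormat, where it is 1.1: the loop keeps carving off block-sized splits only while more than 1.1 blocks remain, and whatever is left (up to about 1.1 blocks) is emitted as a single final split rather than leaving a tiny trailing one.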
From source file:com.ricemap.spateDB.mapred.FileSplitUtil.java
License:Apache License
/**
 * Combines a number of input splits into the given numSplits.
 *
 * @param conf
 * @param inputSplits
 * @param numSplits
 * @return
 * @throws IOException
 */
public static InputSplit[] autoCombineSplits(JobConf conf, Vector<FileSplit> inputSplits, int numSplits)
        throws IOException {
    LOG.info("Combining " + inputSplits.size() + " splits into " + numSplits);
    Map<String, Vector<FileSplit>> blocksPerHost = new HashMap<String, Vector<FileSplit>>();
    for (FileSplit fsplit : inputSplits) {
        // Get locations for this split
        final Path path = fsplit.getPath();
        final FileSystem fs = path.getFileSystem(conf);
        BlockLocation[] blockLocations = fs.getFileBlockLocations(fs.getFileStatus(path), fsplit.getStart(),
                fsplit.getLength());
        for (BlockLocation blockLocation : blockLocations) {
            for (String hostName : blockLocation.getHosts()) {
                if (!blocksPerHost.containsKey(hostName))
                    blocksPerHost.put(hostName, new Vector<FileSplit>());
                blocksPerHost.get(hostName).add(fsplit);
            }
        }
    }

    // If the user requested a fewer number of splits, start to combine them
    InputSplit[] combined_splits = new InputSplit[numSplits];
    int splitsAvailable = inputSplits.size();
    for (int i = 0; i < numSplits; i++) {
        // Decide how many splits to combine
        int numSplitsToCombine = splitsAvailable / (numSplits - i);
        Vector<FileSplit> splitsToCombine = new Vector<FileSplit>();
        while (numSplitsToCombine > 0) {
            // Choose the host with minimum number of splits
            Map.Entry<String, Vector<FileSplit>> minEntry = null;
            for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
                if (minEntry == null || entry.getValue().size() < minEntry.getValue().size()) {
                    minEntry = entry;
                }
            }
            // Combine all or some of blocks in this host
            for (FileSplit fsplit : minEntry.getValue()) {
                if (!splitsToCombine.contains(fsplit)) {
                    splitsToCombine.add(fsplit);
                    if (--numSplitsToCombine == 0)
                        break;
                }
            }
            if (numSplitsToCombine != 0) {
                // Remove this host so that it is not selected again
                blocksPerHost.remove(minEntry.getKey());
            }
        }
        combined_splits[i] = combineFileSplits(conf, splitsToCombine, 0, splitsToCombine.size());
        for (Map.Entry<String, Vector<FileSplit>> entry : blocksPerHost.entrySet()) {
            entry.getValue().removeAll(splitsToCombine);
        }
        splitsAvailable -= splitsToCombine.size();
    }

    LOG.info("Combined splits " + combined_splits.length);
    return combined_splits;
}
From source file:com.ricemap.spateDB.util.ReadFile.java
License:Apache License
public static void main(String[] args) throws Exception {
    CommandLineArguments cla = new CommandLineArguments(args);
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getOffset() == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getOffset(), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}
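Note that the second getFileBlockLocations call above requests a range of length 1 starting at the user-supplied offset, so it returns only the block containing that byte, while the first call covers [0, length) and returns the locations of every block in the file.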