Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
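
Before the real-world examples, here is a minimal, self-contained sketch of the call. The path is illustrative, not taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; replace with a real HDFS or local path
        Path path = new Path("/user/data/input.txt");
        FileSystem fs = path.getFileSystem(new Configuration());

        // getFileStatus throws FileNotFoundException (an IOException)
        // when the path does not exist
        FileStatus status = fs.getFileStatus(path);
        System.out.println("Length:      " + status.getLen());
        System.out.println("Block size:  " + status.getBlockSize());
        System.out.println("Directory?   " + status.isDirectory());
        System.out.println("Modified at: " + status.getModificationTime());
    }
}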

Usage

From source file: edu.umn.cs.spatialHadoop.operations.Repartition.java

License: Open Source License

@SuppressWarnings("deprecation")
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, OperationsParams params)
        throws IOException, InterruptedException {
    String sindex = params.get("sindex");
    long blockSize = params.getSize("blocksize");

    FileSystem inFs = inFile.getFileSystem(new Configuration());
    FileSystem outFs = outFile.getFileSystem(new Configuration());

    // Calculate number of partitions in output file
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            // Copy blocksize from source file if it's globally indexed
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
        } else {
            // Use default block size for output file system
            blockSize = outFs.getDefaultBlockSize();
        }
    }

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cells;
    if (sindex.equals("grid")) {
        Rectangle input_mbr = FileMBR.fileMBR(inFile, params);
        long inFileSize = FileMBR.sizeOfLastProcessedFile;
        int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outFile,
                blockSize);

        GridInfo gridInfo = new GridInfo(input_mbr.x1, input_mbr.y1, input_mbr.x2, input_mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cells = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str")
            || sindex.equals("str+")) {
        cells = packInRectangles(inFile, outFile, params);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    repartitionLocal(inFile, outFile, cells, params);
}

From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java

License: Open Source License

private static int sampleWithRatio(Path[] files, final ResultCollector<? extends TextSerializable> output,
        OperationsParams params) throws IOException {
    FileSystem fs = files[0].getFileSystem(params);
    FileStatus inFStatus = fs.getFileStatus(files[0]);
    if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
        // Either a directory of files or a file spanning more than one block
        return sampleMapReduceWithRatio(files, output, params);
    } else {
        // A single small file, process it without MapReduce
        return sampleLocalWithRatio(files, output, params);
    }
}
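
Note that FileStatus.isDir(), used above, is deprecated in favor of FileStatus.isDirectory(). A sketch of the same dispatch check against the current API (a suggested modernization, not part of the original source):

    FileStatus inFStatus = fs.getFileStatus(files[0]);
    // A directory, or a file spanning more than one block, goes to MapReduce
    boolean useMapReduce = inFStatus.isDirectory()
            || inFStatus.getLen() / inFStatus.getBlockSize() > 1;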

From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java

License: Open Source License

private static <T extends TextSerializable> int sampleLocalWithRatio(Path[] files,
        final ResultCollector<T> output, OperationsParams params) throws IOException {
    long total_size = 0;
    // TODO handle compressed files
    // TODO Use a global index to determine the exact size quickly
    for (Path file : files) {
        FileSystem fs = file.getFileSystem(params);
        FileStatus fStatus = fs.getFileStatus(file);
        if (fStatus.isDir()) {
            // Go one level deeper
            for (FileStatus subFStatus : fs.listStatus(file)) {
                if (!subFStatus.isDir())
                    total_size += subFStatus.getLen();
            }
        } else {
            total_size += fStatus.getLen();
        }
    }
    sizeOfLastProcessedFile = total_size;
    float ratio = params.getFloat("ratio", 0.1f);
    params.setLong("size", (long) (total_size * ratio));
    return sampleLocalWithSize(files, output, params);
}

From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java

License: Open Source License

private static <T extends TextSerializable> int sampleLocalByCount(Path[] files, ResultCollector<T> output,
        OperationsParams params) throws IOException {

    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        FileSystem fs = file.getFileSystem(params);
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }

    files = data_files.toArray(new Path[data_files.size()]);

    TextSerializable inObj1, outObj1;
    inObj1 = OperationsParams.getTextSerializable(params, "shape", new Text2());
    outObj1 = OperationsParams.getTextSerializable(params, "outshape", new Text2());

    // Make the objects final to be able to use in the anonymous inner class
    final TextSerializable inObj = inObj1;
    final T outObj = (T) outObj1;

    ResultCollector<TextSerializable> converter = createConverter(output, inObj, outObj);
    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of file sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        FileSystem fs = files[i_file].getFileSystem(params);
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(params.getLong("seed", System.currentTimeMillis()));
    long[] offsets = new long[params.getInt("count", 0)];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Index of the next sample offset to consume
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < offsets.length) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];
        FileSystem fs = files[file_i].getFileSystem(params);
        ShapeLineRecordReader reader = new ShapeLineRecordReader(fs.getConf(),
                new FileSplit(files[file_i], 0, current_file_size, new String[] {}));
        Rectangle key = reader.createKey();
        Text line = reader.createValue();
        long pos = files_start_offset[file_i];

        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1]
                && reader.next(key, line)) {
            pos += line.getLength();
            if (pos > offsets[record_i]) {
                // Passed the offset of record_i
                // Report this element to output
                if (converter != null) {
                    inObj.fromText(line);
                    converter.collect(inObj);
                }
                record_i++;
                records_returned++;
            }
        }
        reader.close();

        // Skip any remaining records that were supposed to be read from this file
        // This case might happen if a generated random position was in the middle
        // of the last line.
        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1])
            record_i++;
    }
    return records_returned;
}

From source file: edu.umn.cs.spatialHadoop.operations.SJMR.java

License: Open Source License

public static <S extends Shape> long sjmr(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, SJMR.class);

    LOG.info("SJMR journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("SJMR");
    job.setMapperClass(SJMRMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(SJMRReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Touches.java

License: Open Source License

public static <S extends Shape> long touches(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Touches.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Touches");
    job.setMapperClass(TouchesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(TouchesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Within.java

License: Open Source License

public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Within.class);

    LOG.info("Within journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(WithinMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(WithinReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.osm.OSMToKML.java

License: Open Source License

/**
 * @param args command-line arguments parsed by GenericOptionsParser
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    final OperationsParams params = new OperationsParams(new GenericOptionsParser(args), false);
    if (!params.checkInputOutput()) {
        System.err.println("Please specify input and output");
        System.exit(1);
    }
    params.setClass("shape", OSMPolygon.class, Shape.class);
    Path inputPath = params.getInputPath();
    FileSystem inFs = inputPath.getFileSystem(params);
    ShapeArrayRecordReader in = new ShapeArrayRecordReader(params,
            new FileSplit(inputPath, 0, inFs.getFileStatus(inputPath).getLen(), new String[0]));
    Path outPath = params.getOutputPath();
    FileSystem outFs = outPath.getFileSystem(params);
    PrintWriter out;
    ZipOutputStream zipOut = null;
    if (outPath.getName().toLowerCase().endsWith(".kmz")) {
        // Create a KMZ file
        FSDataOutputStream kmzOut = outFs.create(outPath);
        zipOut = new ZipOutputStream(kmzOut);
        zipOut.putNextEntry(new ZipEntry("osm.kml"));
        out = new PrintWriter(zipOut);
    } else {
        out = new PrintWriter(outFs.create(outPath));
    }
    out.println("<?xml version='1.0' encoding='UTF-8'?>");
    out.println("<kml xmlns='http://www.opengis.net/kml/2.2'>");
    out.println("<Document>");
    writeAllStyles(out);
    Rectangle key = in.createKey();
    ArrayWritable values = in.createValue();
    while (in.next(key, values)) {
        System.out.println("Read " + values.get().length);
        for (Shape shape : (Shape[]) values.get()) {
            if (shape instanceof OSMPolygon) {
                out.println(OSMtoKMLElement((OSMPolygon) shape));
            }
        }
        out.println();
    }
    out.println("</Document>");
    out.println("</kml>");
    in.close();
    if (zipOut != null) {
        // KMZ file
        out.flush();
        zipOut.closeEntry();
        zipOut.close();
    } else {
        // KML file
        out.close();
    }
}

From source file: edu.umn.cs.spatialHadoop.ReadFile.java

License: Open Source License

public static void main(String[] args) throws Exception {
    OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getInt("offset", 0) == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}

From source file: edu.umn.cs.spatialHadoop.util.FileUtil.java

License: Open Source License

/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 * @param conf the configuration used to resolve the source file system
 * @param split the file split to copy
 * @return the absolute path of the local copy
 * @throws IOException if the file cannot be read or copied
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(conf);

    // Special case of a local file. Skip copying the file
    if (fs instanceof LocalFileSystem && split.getStart() == 0)
        return split.getPath().toUri().getPath();

    File destFile = File.createTempFile(split.getPath().getName(), "tmp");
    // Special handling for HTTP files for more efficiency
    /*if (fs instanceof HTTPFileSystem && split.getStart() == 0) {
      URL website = split.getPath().toUri().toURL();
      ReadableByteChannel rbc = Channels.newChannel(website.openStream());
      FileOutputStream fos = new FileOutputStream(destFile);
      fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
      fos.close();
      return destFile.getAbsolutePath();
    }*/

    // Length of the input file. We do not depend on split.getLength() because
    // the input format does not set it, for performance reasons: computing it
    // would run on the client machine, while the record reader runs on slave
    // nodes in parallel.
    long length = fs.getFileStatus(split.getPath()).getLen();

    FSDataInputStream in = fs.open(split.getPath());
    in.seek(split.getStart());
    ReadableByteChannel rbc = Channels.newChannel(in);

    // Prepare output file for write
    FileOutputStream out = new FileOutputStream(destFile);

    out.getChannel().transferFrom(rbc, 0, length);

    in.close();
    out.close();
    return destFile.getAbsolutePath();
}
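
A hypothetical call site for copyFileSplit (the path and split bounds are illustrative, and FileUtil here is the SpatialHadoop helper above, not org.apache.hadoop.fs.FileUtil):

    Configuration conf = new Configuration();
    Path remote = new Path("hdfs://namenode/data/points.txt");
    // Copy from offset 0 of the remote file into a local temporary file
    FileSplit split = new FileSplit(remote, 0, 64 * 1024 * 1024, new String[0]);
    String localPath = FileUtil.copyFileSplit(conf, split);
    System.out.println("Local copy at " + localPath);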