List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
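Before the examples below, a minimal stand-alone sketch of the call (not taken from any of the listed source files; the class name, the placeholder path, and a Hadoop 2.x client that provides FileStatus.isDirectory() are assumptions for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; pass a real path as the first argument
    Path path = new Path(args.length > 0 ? args[0] : "/tmp/sample.txt");
    FileSystem fs = path.getFileSystem(conf);

    // getFileStatus throws FileNotFoundException if the path does not exist
    FileStatus status = fs.getFileStatus(path);

    System.out.println("Path:        " + status.getPath());
    System.out.println("Directory?   " + status.isDirectory());
    System.out.println("Length:      " + status.getLen() + " bytes");
    System.out.println("Block size:  " + status.getBlockSize() + " bytes");
    System.out.println("Replication: " + status.getReplication());
    System.out.println("Modified:    " + status.getModificationTime());
  }
}

The examples that follow use the returned FileStatus mainly for getLen() and getBlockSize(); older SpatialHadoop code also calls the deprecated isDir() instead of isDirectory().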
From source file: edu.umn.cs.spatialHadoop.operations.Repartition.java
License: Open Source License

@SuppressWarnings("deprecation")
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, OperationsParams params)
    throws IOException, InterruptedException {
  String sindex = params.get("sindex");
  long blockSize = params.getSize("blocksize");
  FileSystem inFs = inFile.getFileSystem(new Configuration());
  FileSystem outFs = outFile.getFileSystem(new Configuration());

  // Calculate number of partitions in output file
  if (blockSize == 0) {
    GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
    if (globalIndex != null) {
      // Copy blocksize from source file if it's globally indexed
      blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
          .getBlockSize();
    } else {
      // Use default block size for output file system
      blockSize = outFs.getDefaultBlockSize();
    }
  }

  // Calculate the dimensions of each partition based on gindex type
  CellInfo[] cells;
  if (sindex.equals("grid")) {
    Rectangle input_mbr = FileMBR.fileMBR(inFile, params);
    long inFileSize = FileMBR.sizeOfLastProcessedFile;
    int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outFile, blockSize);
    GridInfo gridInfo = new GridInfo(input_mbr.x1, input_mbr.y1, input_mbr.x2, input_mbr.y2);
    gridInfo.calculateCellDimensions(num_partitions);
    cells = gridInfo.getAllCells();
  } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str") || sindex.equals("str+")) {
    cells = packInRectangles(inFile, outFile, params);
  } else {
    throw new RuntimeException("Unsupported spatial index: " + sindex);
  }

  repartitionLocal(inFile, outFile, cells, params);
}
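The example above reuses the block size of an already indexed file when sizing its output. A stand-alone sketch of that pattern follows (not part of Repartition.java; the paths, the overwrite flag, and the buffer-size setting are illustrative placeholders), reading the block size through getFileStatus and passing it to FileSystem.create:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CopyBlockSizeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path existing = new Path("/data/indexed/part-00000"); // placeholder: an existing file
    Path newFile = new Path("/data/output/part-00000");   // placeholder: file to be created

    FileSystem fs = existing.getFileSystem(conf);
    FileStatus existingStatus = fs.getFileStatus(existing);

    // Reuse block size and replication of the existing file for the new one
    FSDataOutputStream out = fs.create(newFile, true /* overwrite */,
        conf.getInt("io.file.buffer.size", 4096),
        existingStatus.getReplication(), existingStatus.getBlockSize());
    out.writeBytes("example record\n");
    out.close();
  }
}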
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static int sampleWithRatio(Path[] files, final ResultCollector<? extends TextSerializable> output,
    OperationsParams params) throws IOException {
  FileSystem fs = files[0].getFileSystem(params);
  FileStatus inFStatus = fs.getFileStatus(files[0]);
  if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
    // Either a directory of files or a large file
    return sampleMapReduceWithRatio(files, output, params);
  } else {
    // A single small file, process it without MapReduce
    return sampleLocalWithRatio(files, output, params);
  }
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static <T extends TextSerializable> int sampleLocalWithRatio(Path[] files,
    final ResultCollector<T> output, OperationsParams params) throws IOException {
  long total_size = 0;
  // TODO handle compressed files
  // TODO Use a global index to determine the exact size quickly
  for (Path file : files) {
    FileSystem fs = file.getFileSystem(params);
    FileStatus fStatus = fs.getFileStatus(file);
    if (fStatus.isDir()) {
      // Go one level deeper
      for (FileStatus subFStatus : fs.listStatus(file)) {
        if (!subFStatus.isDir())
          total_size += subFStatus.getLen();
      }
    } else {
      total_size += fStatus.getLen();
    }
  }
  sizeOfLastProcessedFile = total_size;
  float ratio = params.getFloat("ratio", 0.1f);
  params.setLong("size", (long) (total_size * ratio));
  return sampleLocalWithSize(files, output, params);
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static <T extends TextSerializable> int sampleLocalByCount(Path[] files, ResultCollector<T> output,
    OperationsParams params) throws IOException {
  ArrayList<Path> data_files = new ArrayList<Path>();
  for (Path file : files) {
    FileSystem fs = file.getFileSystem(params);
    if (fs.getFileStatus(file).isDir()) {
      // Directory, process all data files in this directory (visible files)
      FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
      for (FileStatus f : fileStatus) {
        data_files.add(f.getPath());
      }
    } else {
      // File, process this file
      data_files.add(file);
    }
  }
  files = data_files.toArray(new Path[data_files.size()]);

  TextSerializable inObj1, outObj1;
  inObj1 = OperationsParams.getTextSerializable(params, "shape", new Text2());
  outObj1 = OperationsParams.getTextSerializable(params, "outshape", new Text2());

  // Make the objects final to be able to use in the anonymous inner class
  final TextSerializable inObj = inObj1;
  final T outObj = (T) outObj1;
  ResultCollector<TextSerializable> converter = createConverter(output, inObj, outObj);

  long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
  long total_length = 0;
  for (int i_file = 0; i_file < files.length; i_file++) {
    FileSystem fs = files[i_file].getFileSystem(params);
    files_start_offset[i_file] = total_length;
    total_length += fs.getFileStatus(files[i_file]).getLen();
  }
  files_start_offset[files.length] = total_length;

  // Generate offsets to read from and make sure they are ordered to minimize
  // seeks between different HDFS blocks
  Random random = new Random(params.getLong("seed", System.currentTimeMillis()));
  long[] offsets = new long[params.getInt("count", 0)];
  for (int i = 0; i < offsets.length; i++) {
    if (total_length == 0)
      offsets[i] = 0;
    else
      offsets[i] = Math.abs(random.nextLong()) % total_length;
  }
  Arrays.sort(offsets);

  int record_i = 0; // Number of records read so far
  int records_returned = 0;

  int file_i = 0; // Index of the current file being sampled
  while (record_i < offsets.length) {
    // Skip to the file that contains the next sample
    while (offsets[record_i] > files_start_offset[file_i + 1])
      file_i++;

    long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];
    FileSystem fs = files[file_i].getFileSystem(params);
    ShapeLineRecordReader reader = new ShapeLineRecordReader(fs.getConf(),
        new FileSplit(files[file_i], 0, current_file_size, new String[] {}));
    Rectangle key = reader.createKey();
    Text line = reader.createValue();
    long pos = files_start_offset[file_i];

    while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1]
        && reader.next(key, line)) {
      pos += line.getLength();
      if (pos > offsets[record_i]) {
        // Passed the offset of record_i
        // Report this element to output
        if (converter != null) {
          inObj.fromText(line);
          converter.collect(inObj);
        }
        record_i++;
        records_returned++;
      }
    }
    reader.close();

    // Skip any remaining records that were supposed to be read from this file
    // This case might happen if a generated random position was in the middle
    // of the last line.
    while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1])
      record_i++;
  }
  return records_returned;
}
From source file: edu.umn.cs.spatialHadoop.operations.SJMR.java
License: Open Source License

public static <S extends Shape> long sjmr(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, SJMR.class);

  LOG.info("SJMR journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("SJMR");
  job.setMapperClass(SJMRMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(SJMRReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Touches.java
License: Open Source License

public static <S extends Shape> long touches(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, Touches.class);

  LOG.info("Touches journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("Touches");
  job.setMapperClass(TouchesMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(TouchesReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Within.java
License: Open Source License

public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, Within.class);

  LOG.info("Within journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("Within");
  job.setMapperClass(WithinMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(WithinReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.osm.OSMToKML.java
License: Open Source License

/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  final OperationsParams params = new OperationsParams(new GenericOptionsParser(args), false);
  if (!params.checkInputOutput()) {
    System.err.println("Please specify input and output");
    System.exit(1);
  }
  params.setClass("shape", OSMPolygon.class, Shape.class);
  Path inputPath = params.getInputPath();
  FileSystem inFs = inputPath.getFileSystem(params);
  ShapeArrayRecordReader in = new ShapeArrayRecordReader(params,
      new FileSplit(inputPath, 0, inFs.getFileStatus(inputPath).getLen(), new String[0]));
  Path outPath = params.getOutputPath();
  FileSystem outFs = outPath.getFileSystem(params);
  PrintWriter out;
  ZipOutputStream zipOut = null;
  if (outPath.getName().toLowerCase().endsWith(".kmz")) {
    // Create a KMZ file
    FSDataOutputStream kmzOut = outFs.create(outPath);
    zipOut = new ZipOutputStream(kmzOut);
    zipOut.putNextEntry(new ZipEntry("osm.kml"));
    out = new PrintWriter(zipOut);
  } else {
    out = new PrintWriter(outFs.create(outPath));
  }
  out.println("<?xml version='1.0' encoding='UTF-8'?>");
  out.println("<kml xmlns='http://www.opengis.net/kml/2.2'>");
  out.println("<Document>");
  writeAllStyles(out);
  Rectangle key = in.createKey();
  ArrayWritable values = in.createValue();
  while (in.next(key, values)) {
    System.out.println("Read " + values.get().length);
    for (Shape shape : (Shape[]) values.get()) {
      if (shape instanceof OSMPolygon) {
        out.println(OSMtoKMLElement((OSMPolygon) shape));
      }
    }
    out.println();
  }
  out.println("</Document>");
  out.println("</kml>");
  in.close();
  if (zipOut != null) {
    // KMZ file
    out.flush();
    zipOut.closeEntry();
    zipOut.close();
  } else {
    // KML file
    out.close();
  }
}
From source file: edu.umn.cs.spatialHadoop.ReadFile.java
License: Open Source License

public static void main(String[] args) throws Exception {
  OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
  Path input = cla.getPath();
  if (input == null) {
    printUsage();
    throw new RuntimeException("Illegal parameters");
  }
  Configuration conf = new Configuration();
  Path inFile = new Path(args[0]);
  FileSystem fs = inFile.getFileSystem(conf);

  long length = fs.getFileStatus(inFile).getLen();

  GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);

  if (gindex == null) {
    BlockLocation[] locations = cla.getInt("offset", 0) == -1
        ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
        : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
    System.out.println(locations.length + " heap blocks");
  } else {
    for (Partition p : gindex) {
      long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
      System.out.println(p + " --- " + partition_length);
    }
  }
}
From source file: edu.umn.cs.spatialHadoop.util.FileUtil.java
License: Open Source License

/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
  FileSystem fs = split.getPath().getFileSystem(conf);

  // Special case of a local file. Skip copying the file
  if (fs instanceof LocalFileSystem && split.getStart() == 0)
    return split.getPath().toUri().getPath();

  File destFile = File.createTempFile(split.getPath().getName(), "tmp");
  // Special handling for HTTP files for more efficiency
  /*if (fs instanceof HTTPFileSystem && split.getStart() == 0) {
    URL website = split.getPath().toUri().toURL();
    ReadableByteChannel rbc = Channels.newChannel(website.openStream());
    FileOutputStream fos = new FileOutputStream(destFile);
    fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
    fos.close();
    return destFile.getAbsolutePath();
  }*/

  // Length of input file. We do not depend on split.length because it is not
  // set by the input format for performance reasons. Setting it in the input
  // format would cost a lot of time because it runs on the client machine
  // while the record reader runs on slave nodes in parallel
  long length = fs.getFileStatus(split.getPath()).getLen();

  FSDataInputStream in = fs.open(split.getPath());
  in.seek(split.getStart());
  ReadableByteChannel rbc = Channels.newChannel(in);

  // Prepare output file for write
  FileOutputStream out = new FileOutputStream(destFile);

  out.getChannel().transferFrom(rbc, 0, length);

  in.close();
  out.close();
  return destFile.getAbsolutePath();
}