Example usage for org.apache.hadoop.fs FileSystem getFileStatus

List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page lists usage examples for org.apache.hadoop.fs FileSystem getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
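
A minimal, self-contained sketch of the call itself before the full examples (the path below is hypothetical; getFileStatus throws FileNotFoundException if the path does not exist):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical path; replace with a file that exists on your file system
        Path path = new Path("/user/example/input.txt");
        // Resolve the FileSystem instance that owns this path (HDFS, local, ...)
        FileSystem fs = path.getFileSystem(conf);
        // Fetch the metadata record for the path
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length        = " + status.getLen());
        System.out.println("block size    = " + status.getBlockSize());
        System.out.println("modified at   = " + status.getModificationTime());
        // isDir() appears in several examples below; newer Hadoop releases prefer isDirectory()
        System.out.println("is directory  = " + status.isDir());
    }
}

Most of the examples below use the returned FileStatus for getLen() or getBlockSize(), typically to size a read, to estimate job cost, or to pick a minimum split size for a MapReduce job; a few call isDir() to test whether a path is a directory.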

Usage

From source file: edu.umn.cs.spatialHadoop.operations.CatUnion.java

License: Open Source License

/**
 * Read all categories from the category file and map each shape ID to a
 * numeric category ID.
 * @param categoryFile the local file containing lines of "shape_id,category_name"
 * @param idToCategory output map from shape ID to numeric category ID
 * @throws IOException
 */
private static void readCategories(Path categoryFile, Map<Integer, Integer> idToCategory) throws IOException {
    Map<Integer, String> idToCatName = new HashMap<Integer, String>();
    FileSystem fsCategory = FileSystem.getLocal(new Configuration());
    long categoryFileSize = fsCategory.getFileStatus(categoryFile).getLen();
    if (categoryFileSize > 1024 * 1024)
        LOG.warn("Category file size is big: " + categoryFileSize);
    InputStream inCategory = fsCategory.open(categoryFile);
    LineRecordReader lineReader = new LineRecordReader(inCategory, 0, categoryFileSize, new Configuration());
    LongWritable lineOffset = lineReader.createKey();
    Text line = lineReader.createValue();

    Set<String> catNames = new TreeSet<String>();

    while (lineReader.next(lineOffset, line)) {
        int shape_id = TextSerializerHelper.consumeInt(line, ',');
        String cat_name = line.toString();
        catNames.add(cat_name);
        idToCatName.put(shape_id, cat_name);
    }

    lineReader.close();

    // Change category names to numbers
    Map<String, Integer> cat_name_to_id = new HashMap<String, Integer>();
    int cat_id = 0;
    for (String cat_name : catNames) {
        cat_name_to_id.put(cat_name, cat_id++);
    }

    for (Map.Entry<Integer, String> entry : idToCatName.entrySet()) {
        idToCategory.put(entry.getKey(), cat_name_to_id.get(entry.getValue()));
    }
}

From source file: edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java

License: Open Source License

/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    GenericOptionsParser parser = new GenericOptionsParser(args);
    OperationsParams params = new OperationsParams(parser);
    if (args.length == 0) {
        printUsage();
        throw new RuntimeException("Illegal arguments. Input file missing");
    }
    Path inputFile = new Path(args[0]);
    FileSystem fs = inputFile.getFileSystem(new Configuration());
    if (!fs.exists(inputFile)) {
        printUsage();
        throw new RuntimeException("Input file does not exist");
    }
    params.setClass("shape", Point.class, Shape.class);
    samplePoint(fs, inputFile);
    final long fileSize = fs.getFileStatus(inputFile).getLen();
    long delta = (long) (1.0 * sample.size() / (1.0 * fileSize / localMemory));
    if (delta == 0)
        delta = 1;
    System.out.println("delta = " + delta);
    Vector<Point> axis = new Vector<Point>();
    for (int i = 0; i < sample.size(); i += delta)
        axis.add(sample.get(i));
    sample = axis;

    System.out.println("Finish Sampling.");
    cloesetPair(inputFile, params);
}

From source file: edu.umn.cs.spatialHadoop.operations.Contains.java

License: Open Source License

public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Contains.class);

    LOG.info("Contains journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(ContainsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(ContainsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Crosses.java

License: Open Source License

public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Crosses.class);

    LOG.info("Crosses journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Crosses");
    job.setMapperClass(CrossesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(CrossesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Disjoint.java

License: Open Source License

public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Disjoint.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Disjoint");
    job.setMapperClass(DisjointMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(DisjointReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License: Open Source License

/**
* Select a file to repartition based on some heuristics. If only one file is
* indexed, the non-indexed file is repartitioned. If both files are indexed,
* the smaller file is repartitioned.
* 
* @param files
* @param params
* @return the index in the given array of the file to be repartitioned. -1 if
*         all files are non-indexed
* @throws IOException
*/
protected static int selectRepartition(final Path[] files, OperationsParams params) throws IOException {
    int largest_partitioned_file = -1;
    long largest_size = 0;

    for (int i_file = 0; i_file < files.length; i_file++) {
        FileSystem fs = files[i_file].getFileSystem(params);
        GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, files[i_file]);
        if (gindex != null) {
            // Compute total size (all files in directory)
            long total_size = 0;
            for (Partition p : gindex) {
                Path file = new Path(files[i_file], p.filename);
                total_size += fs.getFileStatus(file).getLen();
            }
            if (total_size > largest_size) {
                largest_partitioned_file = i_file;
                largest_size = total_size;
            }
        }
    }
    // Repartition the other file (assumes exactly two input files)
    return largest_partitioned_file == -1 ? -1 : 1 - largest_partitioned_file;
}

From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License: Open Source License

/**
 * Spatially joins two files.
 * @param inputFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public static long distributedJoinSmart(final Path[] inputFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    Path[] originalInputFiles = inputFiles.clone();
    FileSystem outFs = inputFiles[0].getFileSystem(params);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }

    // Decide whether to do a repartition step or not
    int cost_with_repartition, cost_without_repartition;
    final FileStatus[] fStatus = new FileStatus[inputFiles.length];
    for (int i_file = 0; i_file < inputFiles.length; i_file++) {
        // TODO work with folders. Calculate size more accurately
        FileSystem fs = inputFiles[i_file].getFileSystem(params);
        fStatus[i_file] = fs.getFileStatus(inputFiles[i_file]);
    }

    // Sort files by length (size)
    IndexedSortable filesBySize = new IndexedSortable() {
        @Override
        public void swap(int i, int j) {
            Path tmp1 = inputFiles[i];
            inputFiles[i] = inputFiles[j];
            inputFiles[j] = tmp1;

            FileStatus tmp2 = fStatus[i];
            fStatus[i] = fStatus[j];
            fStatus[j] = tmp2;
        }

        @Override
        public int compare(int i, int j) {
            // Sort ascending by file length
            if (fStatus[i].getLen() == fStatus[j].getLen())
                return 0;
            return fStatus[i].getLen() < fStatus[j].getLen() ? -1 : 1;
        }
    };

    new QuickSort().sort(filesBySize, 0, inputFiles.length);
    GlobalIndex<Partition>[] gIndexes = new GlobalIndex[fStatus.length];
    int[] numBlocks = new int[fStatus.length];
    for (int i_file = 0; i_file < fStatus.length; i_file++) {
        gIndexes[i_file] = SpatialSite.getGlobalIndex(outFs, fStatus[i_file].getPath());
        if (gIndexes[i_file] != null) {
            // Number of blocks is equal to number of partitions in global
            // index
            numBlocks[i_file] = gIndexes[i_file].size();
        } else if (fStatus[i_file].isDir()) {
            // Add up number of file system blocks in all subfiles of this
            // directory
            numBlocks[i_file] = 0;
            FileStatus[] subfiles = outFs.listStatus(inputFiles[i_file], SpatialSite.NonHiddenFileFilter);
            for (FileStatus subfile : subfiles) {
                numBlocks[i_file] += outFs.getFileBlockLocations(subfile, 0, subfile.getLen()).length;
            }
        } else {
            // Number of file system blocks in input file
            numBlocks[i_file] = outFs.getFileBlockLocations(fStatus[i_file], 0,
                    fStatus[i_file].getLen()).length;
        }
    }

    cost_without_repartition = gIndexes[0] != null && gIndexes[1] != null
            ? GlobalIndex.spatialJoin(gIndexes[0], gIndexes[1], null)
            : (numBlocks[0] * numBlocks[1]);
    // Total cost = Cost of repartition (=== 2 * numBlocks[0]) +
    // cost of join (=== numBlocks[0] + numBlocks[1])
    cost_with_repartition = numBlocks[0] * 3 + numBlocks[1];
    LOG.info("Cost with repartition is estimated to " + cost_with_repartition);
    LOG.info("Cost without repartition is estimated to " + cost_without_repartition);
    boolean need_repartition = cost_with_repartition < cost_without_repartition;
    if (need_repartition) {
        int file_to_repartition = selectRepartition(inputFiles, params);
        repartitionStep(inputFiles, file_to_repartition, params);
    }

    // Restore inputFiles to the original order by user
    if (inputFiles[1] != originalInputFiles[1]) {
        Path temp = inputFiles[0];
        inputFiles[0] = inputFiles[1];
        inputFiles[1] = temp;
    }

    // Redistribute join the larger file and the partitioned file
    long result_size = DistributedJoin.joinStep(inputFiles, outputPath, params);

    if (userOutputPath == null)
        outFs.delete(outputPath, true);

    return result_size;
}

From source file: edu.umn.cs.spatialHadoop.operations.Equals.java

License: Open Source License

public static <S extends Shape> long equals(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Equals.class);

    LOG.info("Equals journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Equals");
    job.setMapperClass(EqualsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(EqualsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.FileMBR.java

License: Open Source License

public static Partition fileMBRLocal(Path[] inFiles, final OperationsParams params)
        throws IOException, InterruptedException {
    // 1- Split the input path/file to get splits that can be processed independently
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    final List<org.apache.hadoop.mapreduce.InputSplit> splits = inputFormat.getSplits(job);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());

    // 2- Process splits in parallel
    List<Map<String, Partition>> allMbrs = Parallel.forEach(splits.size(),
            new RunnableRange<Map<String, Partition>>() {
                @Override
                public Map<String, Partition> run(int i1, int i2) {
                    Map<String, Partition> mbrs = new HashMap<String, Partition>();
                    for (int i = i1; i < i2; i++) {
                        try {
                            org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits
                                    .get(i);
                            final RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat
                                    .createRecordReader(fsplit, null);
                            if (reader instanceof SpatialRecordReader3) {
                                ((SpatialRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof RTreeRecordReader3) {
                                ((RTreeRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof HDFRecordReader) {
                                ((HDFRecordReader) reader).initialize(fsplit, params);
                            } else {
                                throw new RuntimeException("Unknown record reader");
                            }
                            Partition p = mbrs.get(fsplit.getPath().getName());
                            if (p == null) {
                                p = new Partition();
                                p.filename = fsplit.getPath().getName();
                                p.cellId = p.filename.hashCode();
                                p.size = 0;
                                p.recordCount = 0;
                                p.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
                                mbrs.put(p.filename, p);
                            }
                            Text temp = new Text2();
                            while (reader.nextKeyValue()) {
                                Iterable<Shape> shapes = reader.getCurrentValue();
                                for (Shape s : shapes) {
                                    Rectangle mbr = s.getMBR();
                                    if (mbr != null)
                                        p.expand(mbr);
                                    p.recordCount++;
                                    temp.clear();
                                    s.toText(temp);
                                    p.size += temp.getLength() + 1;
                                }
                            }
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                    }
                    return mbrs;
                }
            }, parallelism);
    Map<String, Partition> mbrs = allMbrs.remove(allMbrs.size() - 1);
    for (Map<String, Partition> list : allMbrs) {
        for (Partition p1 : list.values()) {
            Partition p2 = mbrs.get(p1.filename);
            if (p2 != null) {
                p2.expand(p1);
            } else {
                mbrs.put(p1.filename, p1);
            }
        }
    }

    // Cache the final result, if needed
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!inFs.getFileStatus(inFile).isDir())
            continue;
        Path gindex_path = new Path(inFile, "_master.heap");
        // Answer has been already cached (may be by another job)
        if (inFs.exists(gindex_path))
            continue;
        FileStatus[] files = inFs.listStatus(inFile, SpatialSite.NonHiddenFileFilter);
        PrintStream wktout = new PrintStream(inFs.create(new Path(inFile, "_heap.wkt"), false));
        PrintStream gout = new PrintStream(inFs.create(gindex_path, false));

        Text text = new Text2();
        for (FileStatus file : files) {
            text.clear();
            Partition p = mbrs.get(file.getPath().getName());
            gout.println(p.toText(text).toString());
            wktout.println(p.toWKT());
        }

        wktout.close();
        gout.close();
    }

    // Return the final answer
    Partition finalResult = new Partition();
    finalResult.size = finalResult.recordCount = 0;
    finalResult.x1 = finalResult.y1 = Double.MAX_VALUE;
    finalResult.x2 = finalResult.y2 = -Double.MAX_VALUE;
    for (Partition p2 : mbrs.values())
        finalResult.expand(p2);
    return finalResult;
}

From source file: edu.umn.cs.spatialHadoop.operations.GeometricPlot.java

License: Open Source License

/**
 * Combines images of different datasets into one image that is displayed
 * to users.
 * This method is called from the web interface to display one image for
 * multiple selected datasets.
 * @param fs The file system that contains the datasets and images
 * @param files Paths to directories which contains the datasets
 * @param includeBoundaries Also plot the indexing boundaries of datasets
 * @return An image that is the combination of all datasets images
 * @throws IOException
 * @throws InterruptedException 
 */
public static BufferedImage combineImages(Configuration conf, Path[] files, boolean includeBoundaries,
        int width, int height) throws IOException, InterruptedException {
    BufferedImage result = null;
    // Retrieve the MBRs of all datasets
    Rectangle allMbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : files) {
        Rectangle mbr = FileMBR.fileMBR(file, new OperationsParams(conf));
        allMbr.expand(mbr);
    }

    // Adjust width and height to maintain aspect ratio
    if ((allMbr.x2 - allMbr.x1) / (allMbr.y2 - allMbr.y1) > (double) width / height) {
        // Fix width and change height
        height = (int) ((allMbr.y2 - allMbr.y1) * width / (allMbr.x2 - allMbr.x1));
    } else {
        width = (int) ((allMbr.x2 - allMbr.x1) * height / (allMbr.y2 - allMbr.y1));
    }
    result = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);

    for (Path file : files) {
        FileSystem fs = file.getFileSystem(conf);
        if (fs.getFileStatus(file).isDir()) {
            // Retrieve the MBR of this dataset
            Rectangle mbr = FileMBR.fileMBR(file, new OperationsParams(conf));
            // Compute the coordinates of this image in the whole picture
            mbr.x1 = (mbr.x1 - allMbr.x1) * width / allMbr.getWidth();
            mbr.x2 = (mbr.x2 - allMbr.x1) * width / allMbr.getWidth();
            mbr.y1 = (mbr.y1 - allMbr.y1) * height / allMbr.getHeight();
            mbr.y2 = (mbr.y2 - allMbr.y1) * height / allMbr.getHeight();
            // Retrieve the image of this dataset
            Path imagePath = new Path(file, "_data.png");
            if (!fs.exists(imagePath))
                throw new RuntimeException("Image " + imagePath + " not ready");
            FSDataInputStream imageFile = fs.open(imagePath);
            BufferedImage image = ImageIO.read(imageFile);
            imageFile.close();
            // Draw the image
            Graphics graphics = result.getGraphics();
            graphics.drawImage(image, (int) mbr.x1, (int) mbr.y1, (int) mbr.getWidth(), (int) mbr.getHeight(),
                    null);
            graphics.dispose();

            if (includeBoundaries) {
                // Plot also the image of the boundaries
                // Retrieve the image of the dataset boundaries
                imagePath = new Path(file, "_partitions.png");
                if (fs.exists(imagePath)) {
                    imageFile = fs.open(imagePath);
                    image = ImageIO.read(imageFile);
                    imageFile.close();
                    // Draw the image
                    graphics = result.getGraphics();
                    graphics.drawImage(image, (int) mbr.x1, (int) mbr.y1, (int) mbr.getWidth(),
                            (int) mbr.getHeight(), null);
                    graphics.dispose();
                }
            }
        }
    }

    return result;
}