Example usage for org.apache.hadoop.fs FileSystem getFileStatus

List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page lists usage examples for org.apache.hadoop.fs FileSystem getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
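
A minimal, self-contained sketch of the call itself before the full examples (the path below is hypothetical; getFileStatus throws FileNotFoundException if the path does not exist):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical path; replace with a file that exists on your file system
        Path path = new Path("/user/example/input.txt");
        // Resolve the FileSystem instance that owns this path (HDFS, local, ...)
        FileSystem fs = path.getFileSystem(conf);
        // Fetch the metadata record for the path
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length        = " + status.getLen());
        System.out.println("block size    = " + status.getBlockSize());
        System.out.println("modified at   = " + status.getModificationTime());
        // isDir() appears in several examples below; newer Hadoop releases prefer isDirectory()
        System.out.println("is directory  = " + status.isDir());
    }
}

Most of the examples below use the returned FileStatus for getLen() or getBlockSize(), typically to size a read, to estimate job cost, or to pick a minimum split size for a MapReduce job; a few call isDir() to test whether a path is a directory.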

Usage

From source file: edu.umn.cs.spatialHadoop.operations.CatUnion.java

License: Open Source License

/**
 * Read all categories from the category file and map each shape ID to a
 * numeric category ID.
 * @param categoryFile the local file containing lines of "shape_id,category_name"
 * @param idToCategory output map from shape ID to numeric category ID
 * @throws IOException
 */
private static void readCategories(Path categoryFile, Map<Integer, Integer> idToCategory) throws IOException {
    Map<Integer, String> idToCatName = new HashMap<Integer, String>();
    FileSystem fsCategory = FileSystem.getLocal(new Configuration());
    long categoryFileSize = fsCategory.getFileStatus(categoryFile).getLen();
    if (categoryFileSize > 1024 * 1024)
        LOG.warn("Category file size is big: " + categoryFileSize);
    InputStream inCategory = fsCategory.open(categoryFile);
    LineRecordReader lineReader = new LineRecordReader(inCategory, 0, categoryFileSize, new Configuration());
    LongWritable lineOffset = lineReader.createKey();
    Text line = lineReader.createValue();

    Set<String> catNames = new TreeSet<String>();

    while (lineReader.next(lineOffset, line)) {
        int shape_id = TextSerializerHelper.consumeInt(line, ',');
        String cat_name = line.toString();
        catNames.add(cat_name);
        idToCatName.put(shape_id, cat_name);
    }

    lineReader.close();

    // Change category names to numbers
    Map<String, Integer> cat_name_to_id = new HashMap<String, Integer>();
    int cat_id = 0;
    for (String cat_name : catNames) {
        cat_name_to_id.put(cat_name, cat_id++);
    }

    for (Map.Entry<Integer, String> entry : idToCatName.entrySet()) {
        idToCategory.put(entry.getKey(), cat_name_to_id.get(entry.getValue()));
    }
}

From source file: edu.umn.cs.spatialHadoop.operations.ClosestPairHadoop.java

License: Open Source License

/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    GenericOptionsParser parser = new GenericOptionsParser(args);
    OperationsParams params = new OperationsParams(parser);
    if (args.length == 0) {
        printUsage();
        throw new RuntimeException("Illegal arguments. Input file missing");
    }
    Path inputFile = new Path(args[0]);
    FileSystem fs = inputFile.getFileSystem(new Configuration());
    if (!fs.exists(inputFile)) {
        printUsage();
        throw new RuntimeException("Input file does not exist");
    }
    params.setClass("shape", Point.class, Shape.class);
    samplePoint(fs, inputFile);
    final long fileSize = fs.getFileStatus(inputFile).getLen();
    long delta = (long) (1.0 * sample.size() / (1.0 * fileSize / localMemory));
    if (delta == 0)
        delta = 1;
    System.out.println("delta = " + delta);
    Vector<Point> axis = new Vector<Point>();
    for (int i = 0; i < sample.size(); i += delta)
        axis.add(sample.get(i));
    sample = axis;

    System.out.println("Finish Sampling.");
    cloesetPair(inputFile, params);
}

From source file: edu.umn.cs.spatialHadoop.operations.Contains.java

License: Open Source License

public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Contains.class);

    LOG.info("Contains journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(ContainsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(ContainsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Crosses.java

License: Open Source License

public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Crosses.class);

    LOG.info("Crosses journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Crosses");
    job.setMapperClass(CrossesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(CrossesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.Disjoint.java

License: Open Source License

public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Disjoint.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Disjoint");
    job.setMapperClass(DisjointMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(DisjointReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License: Open Source License

/**
* Select a file to repartition based on some heuristics. If only one file is
* indexed, the non-indexed file is repartitioned. If both files are indexed,
* the smaller file is repartitioned.
* 
* @param files
* @param params
* @return the index in the given array of the file to be repartitioned. -1 if
*         all files are non-indexed
* @throws IOException
*/
protected static int selectRepartition(final Path[] files, OperationsParams params) throws IOException {
    int largest_partitioned_file = -1;
    long largest_size = 0;

    for (int i_file = 0; i_file < files.length; i_file++) {
        FileSystem fs = files[i_file].getFileSystem(params);
        GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, files[i_file]);
        if (gindex != null) {
            // Compute total size (all files in directory)
            long total_size = 0;
            for (Partition p : gindex) {
                Path file = new Path(files[i_file], p.filename);
                total_size += fs.getFileStatus(file).getLen();
            }
            if (total_size > largest_size) {
                largest_partitioned_file = i_file;
                largest_size = total_size;
            }
        }
    }
    // Repartition the other file (assumes exactly two input files)
    return largest_partitioned_file == -1 ? -1 : 1 - largest_partitioned_file;
}

From source file: edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License: Open Source License

/**
 * Spatially joins two files.
 * @param inputFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public static long distributedJoinSmart(final Path[] inputFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    Path[] originalInputFiles = inputFiles.clone();
    FileSystem outFs = inputFiles[0].getFileSystem(params);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }

    // Decide whether to do a repartition step or not
    int cost_with_repartition, cost_without_repartition;
    final FileStatus[] fStatus = new FileStatus[inputFiles.length];
    for (int i_file = 0; i_file < inputFiles.length; i_file++) {
        // TODO work with folders. Calculate size more accurately
        FileSystem fs = inputFiles[i_file].getFileSystem(params);
        fStatus[i_file] = fs.getFileStatus(inputFiles[i_file]);
    }

    // Sort files by length (size)
    IndexedSortable filesBySize = new IndexedSortable() {
        @Override
        public void swap(int i, int j) {
            Path tmp1 = inputFiles[i];
            inputFiles[i] = inputFiles[j];
            inputFiles[j] = tmp1;

            FileStatus tmp2 = fStatus[i];
            fStatus[i] = fStatus[j];
            fStatus[j] = tmp2;
        }

        @Override
        public int compare(int i, int j) {
            // Sort ascending by file length
            if (fStatus[i].getLen() == fStatus[j].getLen())
                return 0;
            return fStatus[i].getLen() < fStatus[j].getLen() ? -1 : 1;
        }
    };

    new QuickSort().sort(filesBySize, 0, inputFiles.length);
    GlobalIndex<Partition>[] gIndexes = new GlobalIndex[fStatus.length];
    int[] numBlocks = new int[fStatus.length];
    for (int i_file = 0; i_file < fStatus.length; i_file++) {
        gIndexes[i_file] = SpatialSite.getGlobalIndex(outFs, fStatus[i_file].getPath());
        if (gIndexes[i_file] != null) {
            // Number of blocks is equal to number of partitions in global
            // index
            numBlocks[i_file] = gIndexes[i_file].size();
        } else if (fStatus[i_file].isDir()) {
            // Add up number of file system blocks in all subfiles of this
            // directory
            numBlocks[i_file] = 0;
            FileStatus[] subfiles = outFs.listStatus(inputFiles[i_file], SpatialSite.NonHiddenFileFilter);
            for (FileStatus subfile : subfiles) {
                numBlocks[i_file] += outFs.getFileBlockLocations(subfile, 0, subfile.getLen()).length;
            }
        } else {
            // Number of file system blocks in input file
            numBlocks[i_file] = outFs.getFileBlockLocations(fStatus[i_file], 0,
                    fStatus[i_file].getLen()).length;
        }
    }

    cost_without_repartition = gIndexes[0] != null && gIndexes[1] != null
            ? GlobalIndex.spatialJoin(gIndexes[0], gIndexes[1], null)
            : (numBlocks[0] * numBlocks[1]);
    // Total cost = Cost of repartition (=== 2 * numBlocks[0]) +
    // cost of join (=== numBlocks[0] + numBlocks[1])
    cost_with_repartition = numBlocks[0] * 3 + numBlocks[1];
    LOG.info("Cost with repartition is estimated to " + cost_with_repartition);
    LOG.info("Cost without repartition is estimated to " + cost_without_repartition);
    boolean need_repartition = cost_with_repartition < cost_without_repartition;
    if (need_repartition) {
        int file_to_repartition = selectRepartition(inputFiles, params);
        repartitionStep(inputFiles, file_to_repartition, params);
    }

    // Restore inputFiles to the original order by user
    if (inputFiles[1] != originalInputFiles[1]) {
        Path temp = inputFiles[0];
        inputFiles[0] = inputFiles[1];
        inputFiles[1] = temp;
    }

    // Redistribute join the larger file and the partitioned file
    long result_size = DistributedJoin.joinStep(inputFiles, outputPath, params);

    if (userOutputPath == null)
        outFs.delete(outputPath, true);

    return result_size;
}

From source file: edu.umn.cs.spatialHadoop.operations.Equals.java

License: Open Source License

public static <S extends Shape> long equals(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Equals.class);

    LOG.info("Equals journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Equals");
    job.setMapperClass(EqualsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(EqualsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file: edu.umn.cs.spatialHadoop.operations.FileMBR.java

License: Open Source License

public static Partition fileMBRLocal(Path[] inFiles, final OperationsParams params)
        throws IOException, InterruptedException {
    // 1- Split the input path/file to get splits that can be processed independently
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    final List<org.apache.hadoop.mapreduce.InputSplit> splits = inputFormat.getSplits(job);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());

    // 2- Process splits in parallel
    List<Map<String, Partition>> allMbrs = Parallel.forEach(splits.size(),
            new RunnableRange<Map<String, Partition>>() {
                @Override
                public Map<String, Partition> run(int i1, int i2) {
                    Map<String, Partition> mbrs = new HashMap<String, Partition>();
                    for (int i = i1; i < i2; i++) {
                        try {
                            org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits
                                    .get(i);
                            final RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat
                                    .createRecordReader(fsplit, null);
                            if (reader instanceof SpatialRecordReader3) {
                                ((SpatialRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof RTreeRecordReader3) {
                                ((RTreeRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof HDFRecordReader) {
                                ((HDFRecordReader) reader).initialize(fsplit, params);
                            } else {
                                throw new RuntimeException("Unknown record reader");
                            }
                            Partition p = mbrs.get(fsplit.getPath().getName());
                            if (p == null) {
                                p = new Partition();
                                p.filename = fsplit.getPath().getName();
                                p.cellId = p.filename.hashCode();
                                p.size = 0;
                                p.recordCount = 0;
                                p.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
                                mbrs.put(p.filename, p);
                            }
                            Text temp = new Text2();
                            while (reader.nextKeyValue()) {
                                Iterable<Shape> shapes = reader.getCurrentValue();
                                for (Shape s : shapes) {
                                    Rectangle mbr = s.getMBR();
                                    if (mbr != null)
                                        p.expand(mbr);
                                    p.recordCount++;
                                    temp.clear();
                                    s.toText(temp);
                                    p.size += temp.getLength() + 1;
                                }
                            }
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                    }
                    return mbrs;
                }
            }, parallelism);
    Map<String, Partition> mbrs = allMbrs.remove(allMbrs.size() - 1);
    for (Map<String, Partition> list : allMbrs) {
        for (Partition p1 : list.values()) {
            Partition p2 = mbrs.get(p1.filename);
            if (p2 != null) {
                p2.expand(p1);
            } else {
                mbrs.put(p1.filename, p1);
            }
        }
    }

    // Cache the final result, if needed
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!inFs.getFileStatus(inFile).isDir())
            continue;
        Path gindex_path = new Path(inFile, "_master.heap");
        // Answer has been already cached (may be by another job)
        if (inFs.exists(gindex_path))
            continue;
        FileStatus[] files = inFs.listStatus(inFile, SpatialSite.NonHiddenFileFilter);
        PrintStream wktout = new PrintStream(inFs.create(new Path(inFile, "_heap.wkt"), false));
        PrintStream gout = new PrintStream(inFs.create(gindex_path, false));

        Text text = new Text2();
        for (FileStatus file : files) {
            text.clear();
            Partition p = mbrs.get(file.getPath().getName());
            gout.println(p.toText(text).toString());
            wktout.println(p.toWKT());
        }

        wktout.close();
        gout.close();
    }

    // Return the final answer
    Partition finalResult = new Partition();
    finalResult.size = finalResult.recordCount = 0;
    finalResult.x1 = finalResult.y1 = Double.MAX_VALUE;
    finalResult.x2 = finalResult.y2 = -Double.MAX_VALUE;
    for (Partition p2 : mbrs.values())
        finalResult.expand(p2);
    return finalResult;
}

From source file: edu.umn.cs.spatialHadoop.operations.GeometricPlot.java

License: Open Source License

/**
 * Combines images of different datasets into one image that is displayed
 * to users.
 * This method is called from the web interface to display one image for
 * multiple selected datasets.
 * @param fs The file system that contains the datasets and images
 * @param files Paths to directories which contains the datasets
 * @param includeBoundaries Also plot the indexing boundaries of datasets
 * @return An image that is the combination of all datasets images
 * @throws IOException
 * @throws InterruptedException 
 */
public static BufferedImage combineImages(Configuration conf, Path[] files, boolean includeBoundaries,
        int width, int height) throws IOException, InterruptedException {
    BufferedImage result = null;
    // Retrieve the MBRs of all datasets
    Rectangle allMbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : files) {
        Rectangle mbr = FileMBR.fileMBR(file, new OperationsParams(conf));
        allMbr.expand(mbr);
    }

    // Adjust width and height to maintain aspect ratio
    if ((allMbr.x2 - allMbr.x1) / (allMbr.y2 - allMbr.y1) > (double) width / height) {
        // Fix width and change height
        height = (int) ((allMbr.y2 - allMbr.y1) * width / (allMbr.x2 - allMbr.x1));
    } else {
        width = (int) ((allMbr.x2 - allMbr.x1) * height / (allMbr.y2 - allMbr.y1));
    }
    result = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);

    for (Path file : files) {
        FileSystem fs = file.getFileSystem(conf);
        if (fs.getFileStatus(file).isDir()) {
            // Retrieve the MBR of this dataset
            Rectangle mbr = FileMBR.fileMBR(file, new OperationsParams(conf));
            // Compute the coordinates of this image in the whole picture
            mbr.x1 = (mbr.x1 - allMbr.x1) * width / allMbr.getWidth();
            mbr.x2 = (mbr.x2 - allMbr.x1) * width / allMbr.getWidth();
            mbr.y1 = (mbr.y1 - allMbr.y1) * height / allMbr.getHeight();
            mbr.y2 = (mbr.y2 - allMbr.y1) * height / allMbr.getHeight();
            // Retrieve the image of this dataset
            Path imagePath = new Path(file, "_data.png");
            if (!fs.exists(imagePath))
                throw new RuntimeException("Image " + imagePath + " not ready");
            FSDataInputStream imageFile = fs.open(imagePath);
            BufferedImage image = ImageIO.read(imageFile);
            imageFile.close();
            // Draw the image
            Graphics graphics = result.getGraphics();
            graphics.drawImage(image, (int) mbr.x1, (int) mbr.y1, (int) mbr.getWidth(), (int) mbr.getHeight(),
                    null);
            graphics.dispose();

            if (includeBoundaries) {
                // Plot also the image of the boundaries
                // Retrieve the image of the dataset boundaries
                imagePath = new Path(file, "_partitions.png");
                if (fs.exists(imagePath)) {
                    imageFile = fs.open(imagePath);
                    image = ImageIO.read(imageFile);
                    imageFile.close();
                    // Draw the image
                    graphics = result.getGraphics();
                    graphics.drawImage(image, (int) mbr.x1, (int) mbr.y1, (int) mbr.getWidth(),
                            (int) mbr.getHeight(), null);
                    graphics.dispose();
                }
            }
        }
    }

    return result;
}