List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
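Before the examples below, a minimal stand-alone sketch of the call (not taken from any of the listed source files; the class name, the placeholder path, and a Hadoop 2.x client that provides FileStatus.isDirectory() are assumptions for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; pass a real path as the first argument
    Path path = new Path(args.length > 0 ? args[0] : "/tmp/sample.txt");
    FileSystem fs = path.getFileSystem(conf);

    // getFileStatus throws FileNotFoundException if the path does not exist
    FileStatus status = fs.getFileStatus(path);

    System.out.println("Path:        " + status.getPath());
    System.out.println("Directory?   " + status.isDirectory());
    System.out.println("Length:      " + status.getLen() + " bytes");
    System.out.println("Block size:  " + status.getBlockSize() + " bytes");
    System.out.println("Replication: " + status.getReplication());
    System.out.println("Modified:    " + status.getModificationTime());
  }
}

The examples that follow use the returned FileStatus mainly for getLen() and getBlockSize(); older SpatialHadoop code also calls the deprecated isDir() instead of isDirectory().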
From source file: edu.umn.cs.spatialHadoop.operations.Repartition.java
License: Open Source License

@SuppressWarnings("deprecation")
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, OperationsParams params)
    throws IOException, InterruptedException {
  String sindex = params.get("sindex");
  long blockSize = params.getSize("blocksize");
  FileSystem inFs = inFile.getFileSystem(new Configuration());
  FileSystem outFs = outFile.getFileSystem(new Configuration());

  // Calculate number of partitions in output file
  if (blockSize == 0) {
    GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
    if (globalIndex != null) {
      // Copy blocksize from source file if it's globally indexed
      blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
          .getBlockSize();
    } else {
      // Use default block size for output file system
      blockSize = outFs.getDefaultBlockSize();
    }
  }

  // Calculate the dimensions of each partition based on gindex type
  CellInfo[] cells;
  if (sindex.equals("grid")) {
    Rectangle input_mbr = FileMBR.fileMBR(inFile, params);
    long inFileSize = FileMBR.sizeOfLastProcessedFile;
    int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outFile, blockSize);
    GridInfo gridInfo = new GridInfo(input_mbr.x1, input_mbr.y1, input_mbr.x2, input_mbr.y2);
    gridInfo.calculateCellDimensions(num_partitions);
    cells = gridInfo.getAllCells();
  } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str") || sindex.equals("str+")) {
    cells = packInRectangles(inFile, outFile, params);
  } else {
    throw new RuntimeException("Unsupported spatial index: " + sindex);
  }

  repartitionLocal(inFile, outFile, cells, params);
}
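The example above reuses the block size of an already indexed file when sizing its output. A stand-alone sketch of that pattern follows (not part of Repartition.java; the paths, the overwrite flag, and the buffer-size setting are illustrative placeholders), reading the block size through getFileStatus and passing it to FileSystem.create:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CopyBlockSizeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path existing = new Path("/data/indexed/part-00000"); // placeholder: an existing file
    Path newFile = new Path("/data/output/part-00000");   // placeholder: file to be created

    FileSystem fs = existing.getFileSystem(conf);
    FileStatus existingStatus = fs.getFileStatus(existing);

    // Reuse block size and replication of the existing file for the new one
    FSDataOutputStream out = fs.create(newFile, true /* overwrite */,
        conf.getInt("io.file.buffer.size", 4096),
        existingStatus.getReplication(), existingStatus.getBlockSize());
    out.writeBytes("example record\n");
    out.close();
  }
}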
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static int sampleWithRatio(Path[] files, final ResultCollector<? extends TextSerializable> output,
    OperationsParams params) throws IOException {
  FileSystem fs = files[0].getFileSystem(params);
  FileStatus inFStatus = fs.getFileStatus(files[0]);
  if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
    // Either a directory of files or a large file
    return sampleMapReduceWithRatio(files, output, params);
  } else {
    // A single small file, process it without MapReduce
    return sampleLocalWithRatio(files, output, params);
  }
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static <T extends TextSerializable> int sampleLocalWithRatio(Path[] files,
    final ResultCollector<T> output, OperationsParams params) throws IOException {
  long total_size = 0;
  // TODO handle compressed files
  // TODO Use a global index to determine the exact size quickly
  for (Path file : files) {
    FileSystem fs = file.getFileSystem(params);
    FileStatus fStatus = fs.getFileStatus(file);
    if (fStatus.isDir()) {
      // Go one level deeper
      for (FileStatus subFStatus : fs.listStatus(file)) {
        if (!subFStatus.isDir())
          total_size += subFStatus.getLen();
      }
    } else {
      total_size += fStatus.getLen();
    }
  }
  sizeOfLastProcessedFile = total_size;
  float ratio = params.getFloat("ratio", 0.1f);
  params.setLong("size", (long) (total_size * ratio));
  return sampleLocalWithSize(files, output, params);
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static <T extends TextSerializable> int sampleLocalByCount(Path[] files, ResultCollector<T> output,
    OperationsParams params) throws IOException {
  ArrayList<Path> data_files = new ArrayList<Path>();
  for (Path file : files) {
    FileSystem fs = file.getFileSystem(params);
    if (fs.getFileStatus(file).isDir()) {
      // Directory, process all data files in this directory (visible files)
      FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
      for (FileStatus f : fileStatus) {
        data_files.add(f.getPath());
      }
    } else {
      // File, process this file
      data_files.add(file);
    }
  }
  files = data_files.toArray(new Path[data_files.size()]);

  TextSerializable inObj1, outObj1;
  inObj1 = OperationsParams.getTextSerializable(params, "shape", new Text2());
  outObj1 = OperationsParams.getTextSerializable(params, "outshape", new Text2());

  // Make the objects final to be able to use in the anonymous inner class
  final TextSerializable inObj = inObj1;
  final T outObj = (T) outObj1;
  ResultCollector<TextSerializable> converter = createConverter(output, inObj, outObj);

  long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
  long total_length = 0;
  for (int i_file = 0; i_file < files.length; i_file++) {
    FileSystem fs = files[i_file].getFileSystem(params);
    files_start_offset[i_file] = total_length;
    total_length += fs.getFileStatus(files[i_file]).getLen();
  }
  files_start_offset[files.length] = total_length;

  // Generate offsets to read from and make sure they are ordered to minimize
  // seeks between different HDFS blocks
  Random random = new Random(params.getLong("seed", System.currentTimeMillis()));
  long[] offsets = new long[params.getInt("count", 0)];
  for (int i = 0; i < offsets.length; i++) {
    if (total_length == 0)
      offsets[i] = 0;
    else
      offsets[i] = Math.abs(random.nextLong()) % total_length;
  }
  Arrays.sort(offsets);

  int record_i = 0; // Number of records read so far
  int records_returned = 0;

  int file_i = 0; // Index of the current file being sampled
  while (record_i < offsets.length) {
    // Skip to the file that contains the next sample
    while (offsets[record_i] > files_start_offset[file_i + 1])
      file_i++;

    long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];
    FileSystem fs = files[file_i].getFileSystem(params);
    ShapeLineRecordReader reader = new ShapeLineRecordReader(fs.getConf(),
        new FileSplit(files[file_i], 0, current_file_size, new String[] {}));
    Rectangle key = reader.createKey();
    Text line = reader.createValue();
    long pos = files_start_offset[file_i];

    while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1]
        && reader.next(key, line)) {
      pos += line.getLength();
      if (pos > offsets[record_i]) {
        // Passed the offset of record_i
        // Report this element to output
        if (converter != null) {
          inObj.fromText(line);
          converter.collect(inObj);
        }
        record_i++;
        records_returned++;
      }
    }
    reader.close();

    // Skip any remaining records that were supposed to be read from this file
    // This case might happen if a generated random position was in the middle
    // of the last line.
    while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1])
      record_i++;
  }
  return records_returned;
}
From source file: edu.umn.cs.spatialHadoop.operations.SJMR.java
License: Open Source License

public static <S extends Shape> long sjmr(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, SJMR.class);

  LOG.info("SJMR journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("SJMR");
  job.setMapperClass(SJMRMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(SJMRReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Touches.java
License: Open Source License

public static <S extends Shape> long touches(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, Touches.class);

  LOG.info("Touches journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("Touches");
  job.setMapperClass(TouchesMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(TouchesReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.Within.java
License: Open Source License

public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException, InterruptedException {
  JobConf job = new JobConf(params, Within.class);

  LOG.info("Within journey starts ....");
  FileSystem inFs = inFiles[0].getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    FileSystem outFs = FileSystem.get(job);
    do {
      outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  }
  FileSystem outFs = outputPath.getFileSystem(job);

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setJobName("Within");
  job.setMapperClass(WithinMap.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IndexedText.class);
  job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setLong("mapred.min.split.size",
      Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));

  job.setReducerClass(WithinReduce.class);
  job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

  job.setInputFormat(ShapeLineInputFormat.class);
  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);
  ShapeLineInputFormat.setInputPaths(job, inFiles);

  // Calculate and set the dimensions of the grid to use in the map phase
  long total_size = 0;
  Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
  for (Path file : inFiles) {
    FileSystem fs = file.getFileSystem(params);
    Rectangle file_mbr = FileMBR.fileMBR(file, params);
    mbr.expand(file_mbr);
    total_size += FileUtil.getPathSize(fs, file);
  }
  // If the largest file is globally indexed, use its partitions
  total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
  int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
  int num_cells = (int) Math.max(1,
      total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
  LOG.info("Number of cells is configured to be " + num_cells);

  OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
  OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
  gridInfo.calculateCellDimensions(num_cells);
  OperationsParams.setShape(job, PartitionGrid, gridInfo);

  TextOutputFormat.setOutputPath(job, outputPath);
  if (OperationsParams.isLocal(job, inFiles)) {
    // Enforce local execution if explicitly set by user or for small files
    job.set("mapred.job.tracker", "local");
  }

  // Start the job
  RunningJob runningJob = JobClient.runJob(job);
  Counters counters = runningJob.getCounters();
  Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
  final long resultCount = outputRecordCounter.getValue();

  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.osm.OSMToKML.java
License: Open Source License

/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  final OperationsParams params = new OperationsParams(new GenericOptionsParser(args), false);
  if (!params.checkInputOutput()) {
    System.err.println("Please specify input and output");
    System.exit(1);
  }
  params.setClass("shape", OSMPolygon.class, Shape.class);
  Path inputPath = params.getInputPath();
  FileSystem inFs = inputPath.getFileSystem(params);
  ShapeArrayRecordReader in = new ShapeArrayRecordReader(params,
      new FileSplit(inputPath, 0, inFs.getFileStatus(inputPath).getLen(), new String[0]));
  Path outPath = params.getOutputPath();
  FileSystem outFs = outPath.getFileSystem(params);
  PrintWriter out;
  ZipOutputStream zipOut = null;
  if (outPath.getName().toLowerCase().endsWith(".kmz")) {
    // Create a KMZ file
    FSDataOutputStream kmzOut = outFs.create(outPath);
    zipOut = new ZipOutputStream(kmzOut);
    zipOut.putNextEntry(new ZipEntry("osm.kml"));
    out = new PrintWriter(zipOut);
  } else {
    out = new PrintWriter(outFs.create(outPath));
  }
  out.println("<?xml version='1.0' encoding='UTF-8'?>");
  out.println("<kml xmlns='http://www.opengis.net/kml/2.2'>");
  out.println("<Document>");
  writeAllStyles(out);
  Rectangle key = in.createKey();
  ArrayWritable values = in.createValue();
  while (in.next(key, values)) {
    System.out.println("Read " + values.get().length);
    for (Shape shape : (Shape[]) values.get()) {
      if (shape instanceof OSMPolygon) {
        out.println(OSMtoKMLElement((OSMPolygon) shape));
      }
    }
    out.println();
  }
  out.println("</Document>");
  out.println("</kml>");
  in.close();
  if (zipOut != null) {
    // KMZ file
    out.flush();
    zipOut.closeEntry();
    zipOut.close();
  } else {
    // KML file
    out.close();
  }
}
From source file: edu.umn.cs.spatialHadoop.ReadFile.java
License: Open Source License

public static void main(String[] args) throws Exception {
  OperationsParams cla = new OperationsParams(new GenericOptionsParser(args));
  Path input = cla.getPath();
  if (input == null) {
    printUsage();
    throw new RuntimeException("Illegal parameters");
  }
  Configuration conf = new Configuration();
  Path inFile = new Path(args[0]);
  FileSystem fs = inFile.getFileSystem(conf);

  long length = fs.getFileStatus(inFile).getLen();

  GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);

  if (gindex == null) {
    BlockLocation[] locations = cla.getInt("offset", 0) == -1
        ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
        : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getInt("offset", 0), 1);
    System.out.println(locations.length + " heap blocks");
  } else {
    for (Partition p : gindex) {
      long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
      System.out.println(p + " --- " + partition_length);
    }
  }
}
From source file: edu.umn.cs.spatialHadoop.util.FileUtil.java
License: Open Source License

/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
  FileSystem fs = split.getPath().getFileSystem(conf);

  // Special case of a local file. Skip copying the file
  if (fs instanceof LocalFileSystem && split.getStart() == 0)
    return split.getPath().toUri().getPath();

  File destFile = File.createTempFile(split.getPath().getName(), "tmp");
  // Special handling for HTTP files for more efficiency
  /*if (fs instanceof HTTPFileSystem && split.getStart() == 0) {
    URL website = split.getPath().toUri().toURL();
    ReadableByteChannel rbc = Channels.newChannel(website.openStream());
    FileOutputStream fos = new FileOutputStream(destFile);
    fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
    fos.close();
    return destFile.getAbsolutePath();
  }*/

  // Length of input file. We do not depend on split.length because it is not
  // set by the input format for performance reasons. Setting it in the input
  // format would cost a lot of time because it runs on the client machine
  // while the record reader runs on slave nodes in parallel
  long length = fs.getFileStatus(split.getPath()).getLen();

  FSDataInputStream in = fs.open(split.getPath());
  in.seek(split.getStart());
  ReadableByteChannel rbc = Channels.newChannel(in);

  // Prepare output file for write
  FileOutputStream out = new FileOutputStream(destFile);

  out.getChannel().transferFrom(rbc, 0, length);

  in.close();
  out.close();
  return destFile.getAbsolutePath();
}