List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
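None of the examples below uses this Path[] overload directly, so here is a minimal, hypothetical sketch of calling it; the input directories and the ".tmp" suffix filter are assumptions for illustration only, not taken from any of the source files listed.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical input directories; any existing paths would do
        Path[] inputs = new Path[] { new Path("/data/in1"), new Path("/data/in2") };
        // List the contents of all input paths, skipping temporary files
        FileStatus[] statuses = fs.listStatus(inputs, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().endsWith(".tmp");
            }
        });
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}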
From source file:com.redsqirl.workflow.server.Workflow.java
License:Open Source License
/**
 * Clean the backup directory.
 *
 * @throws IOException
 */
public void cleanUpBackup() throws IOException {
    String path = WorkflowPrefManager.getBackupPath();
    int nbBackup = WorkflowPrefManager.getNbBackup();
    FileSystem fs = NameNodeVar.getFS();
    // FileStatus stat = fs.getFileStatus(new Path(path));
    FileStatus[] fsA = fs.listStatus(new Path(path), new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return arg0.getName().matches(".*[0-9]{14}(.rs|.srs)$");
        }
    });
    logger.debug("Backup directory: " + fsA.length + " files, " + nbBackup + " to keep, "
            + Math.max(0, fsA.length - nbBackup) + " to remove");
    if (fsA.length > nbBackup) {
        int numberToRemove = fsA.length - nbBackup;
        Map<Path, Long> pathToRemove = new HashMap<Path, Long>();
        Path pathMin = null;
        Long min = Long.MAX_VALUE;
        for (FileStatus stat : fsA) {
            if (pathToRemove.size() < numberToRemove) {
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            } else if (min > stat.getModificationTime()) {
                pathToRemove.remove(pathMin);
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            }
            if (min > stat.getModificationTime()) {
                min = stat.getModificationTime();
                pathMin = stat.getPath();
            }
        }
        for (Path pathDel : pathToRemove.keySet()) {
            fs.delete(pathDel, false);
        }
    }
    // fs.close();
}
From source file:com.ricemap.spateDB.core.SpatialSite.java
License:Apache License
public static GlobalIndex<Partition> getGlobalIndex(FileSystem fs, Path dir) throws IOException {
    // Retrieve the master file (the only file with the name _master in it)
    FileStatus[] masterFiles = fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    // Check if the given file is indexed
    if (masterFiles.length == 0)
        return null;
    if (masterFiles.length > 1)
        throw new RuntimeException("Found more than one master file in " + dir);
    Path masterFile = masterFiles[0].getPath();
    ShapeRecordReader<Partition> reader = new ShapeRecordReader<Partition>(fs.open(masterFile), 0,
            fs.getFileStatus(masterFile).getLen());
    CellInfo dummy = new CellInfo();
    Partition partition = new Partition();
    ArrayList<Partition> partitions = new ArrayList<Partition>();
    while (reader.next(dummy, partition)) {
        partitions.add(partition.clone());
    }
    GlobalIndex<Partition> globalIndex = new GlobalIndex<Partition>();
    globalIndex.bulkLoad(partitions.toArray(new Partition[partitions.size()]));
    globalIndex.setCompact(masterFile.getName().endsWith("rtree") || masterFile.getName().endsWith("r+tree"));
    globalIndex.setReplicated(masterFile.getName().endsWith("r+tree") || masterFile.getName().endsWith("grid"));
    return globalIndex;
}
From source file:com.ricemap.spateDB.mapred.SpatialInputFormat.java
License:Apache License
protected void listStatus(final FileSystem fs, final Path dir, final List<FileStatus> result,
        BlockFilter filter) throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    FileStatus[] listStatus = fs.listStatus(dir, hiddenFileFilter);
    if (gindex == null) {
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else {
                result.add(status);
            }
        }
    } else {
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(dir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Reads a sample of the given files and returns the number of items read.
 *
 * @param fs
 * @param files
 * @param count
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }
    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);

    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block;
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the last line
        long data_end_offset = current_file_size;
        // Skip the last line too to ensure that the mapped position
        // will be before some line in the block
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);

        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as records offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);
            // Read the first line after that offset
            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line
            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}
From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java
License:Apache License
public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}
From source file:com.run.mapred.hbase2tsv.HFileInputFormat_mr1.java
License:Apache License
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job.getConfiguration());
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    return result;
}
From source file:com.run.mapred.hbase2tsv.HFileInputFormat_mr2.java
License:Apache License
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job);
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    return result.toArray(new FileStatus[result.size()]);
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
private int getFinalFileNameCount(FileSystem fs, Path dir, final String prefix) throws IOException {
    return fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(prefix);
        }
    }).length;
}
From source file:com.TCG.Nutch_DNS.HostDb.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println(
                "Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
        System.err.println("\tcrawldb\tCrawlDb to update");
        System.err.println("\t-dir segments\tparent directory containing all segments to update from");
        System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
        System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
        System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
        System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
        System.err.println(
                "\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
        return -1;
    }
    boolean normalize = getConf().getBoolean(HostDbFilter.URL_NORMALIZING, false);
    boolean filter = getConf().getBoolean(HostDbFilter.URL_FILTERING, false);
    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    boolean force = false;
    final FileSystem fs = FileSystem.get(getConf());
    HashSet<Path> dirs = new HashSet<Path>();
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-normalize")) {
            normalize = true;
        } else if (args[i].equals("-filter")) {
            filter = true;
        } else if (args[i].equals("-force")) {
            force = true;
        } else if (args[i].equals("-noAdditions")) {
            additionsAllowed = false;
        } else if (args[i].equals("-dir")) {
            FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
            dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else {
            dirs.add(new Path(args[i]));
        }
    }
    try {
        update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed,
                force);
        return 0;
    } catch (Exception e) {
        LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
        return -1;
    }
}
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
/**
 * [Re-]Initialize the MapDir with a new directory of MapFiles. {@link #close()}
 * must be called first if MapDir is already initialized.
 *
 * @param dir
 * @param conf
 * @return
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, mapFilter));
    partitioner = new TitlePartitioner(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}