Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

This page lists example usages of listStatus from org.apache.hadoop.fs.FileSystem.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using a user-supplied path filter.
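
FileSystem also provides a single-path overload, listStatus(Path f, PathFilter filter), which most of the examples below use. The following minimal, self-contained sketch shows both overloads with a filter that skips hidden entries; the directory paths and the filter logic are illustrative assumptions, not taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Filter out hidden files and Hadoop bookkeeping files such as _SUCCESS
        PathFilter visibleFilesOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // Single-path overload: list the visible children of one directory
        FileStatus[] children = fs.listStatus(new Path("/tmp/input"), visibleFilesOnly);
        for (FileStatus status : children) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }

        // Path[] overload (the prototype above): list visible children of several directories at once
        Path[] dirs = new Path[] { new Path("/tmp/a"), new Path("/tmp/b") };
        for (FileStatus status : fs.listStatus(dirs, visibleFilesOnly)) {
            System.out.println(status.getPath());
        }
    }
}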

Usage

From source file: com.redsqirl.workflow.server.Workflow.java

License: Open Source License

/**
 * Clean the backup directory.
 * 
 * @throws IOException
 */
public void cleanUpBackup() throws IOException {
    String path = WorkflowPrefManager.getBackupPath();
    int nbBackup = WorkflowPrefManager.getNbBackup();

    FileSystem fs = NameNodeVar.getFS();
    // FileStatus stat = fs.getFileStatus(new Path(path));
    FileStatus[] fsA = fs.listStatus(new Path(path), new PathFilter() {

        @Override
        public boolean accept(Path arg0) {
            // Keep only backup files: a 14-digit timestamp followed by the .rs or .srs extension
            return arg0.getName().matches(".*[0-9]{14}(\\.rs|\\.srs)$");
        }
    });
    logger.debug("Backup directory: " + fsA.length + " files, " + nbBackup + " to keep, "
            + Math.max(0, fsA.length - nbBackup) + " to remove");
    if (fsA.length > nbBackup) {
        int numberToRemove = fsA.length - nbBackup;
        Map<Path, Long> pathToRemove = new HashMap<Path, Long>();
        Path pathMin = null;
        Long min = Long.MAX_VALUE;
        for (FileStatus stat : fsA) {
            if (pathToRemove.size() < numberToRemove) {
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            } else if (min > stat.getModificationTime()) {
                pathToRemove.remove(pathMin);
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            }
            if (min > stat.getModificationTime()) {
                min = stat.getModificationTime();
                pathMin = stat.getPath();
            }

        }
        for (Path pathDel : pathToRemove.keySet()) {
            fs.delete(pathDel, false);
        }
    }
    // fs.close();
}

From source file: com.ricemap.spateDB.core.SpatialSite.java

License: Apache License

public static GlobalIndex<Partition> getGlobalIndex(FileSystem fs, Path dir) throws IOException {
    // Retrieve the master file (the only file with the name _master in it)
    FileStatus[] masterFiles = fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    // Check if the given file is indexed
    if (masterFiles.length == 0)
        return null;
    if (masterFiles.length > 1)
        throw new RuntimeException("Found more than one master file in " + dir);
    Path masterFile = masterFiles[0].getPath();
    ShapeRecordReader<Partition> reader = new ShapeRecordReader<Partition>(fs.open(masterFile), 0,
            fs.getFileStatus(masterFile).getLen());
    CellInfo dummy = new CellInfo();
    Partition partition = new Partition();
    ArrayList<Partition> partitions = new ArrayList<Partition>();
    while (reader.next(dummy, partition)) {
        partitions.add(partition.clone());
    }
    GlobalIndex<Partition> globalIndex = new GlobalIndex<Partition>();
    globalIndex.bulkLoad(partitions.toArray(new Partition[partitions.size()]));
    globalIndex.setCompact(masterFile.getName().endsWith("rtree") || masterFile.getName().endsWith("r+tree"));
    globalIndex.setReplicated(masterFile.getName().endsWith("r+tree") || masterFile.getName().endsWith("grid"));
    return globalIndex;
}

From source file: com.ricemap.spateDB.mapred.SpatialInputFormat.java

License: Apache License

protected void listStatus(final FileSystem fs, final Path dir, final List<FileStatus> result,
        BlockFilter filter) throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    FileStatus[] listStatus = fs.listStatus(dir, hiddenFileFilter);
    if (gindex == null) {
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else {
                result.add(status);
            }
        }
    } else {
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(dir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
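
The hiddenFileFilter used by listStatus above (and by the Sampler example below) is declared elsewhere in the project and is not shown in this snippet. A plausible sketch, assuming the usual Hadoop convention of skipping underscore- and dot-prefixed names:

static final PathFilter hiddenFileFilter = new PathFilter() {
    @Override
    public boolean accept(Path p) {
        String name = p.getName();
        // Skip hidden files and bookkeeping outputs such as _SUCCESS or _master files
        return !name.startsWith("_") && !name.startsWith(".");
    }
};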

From source file: com.ricemap.spateDB.operations.Sampler.java

License: Apache License

/**
 * Reads a sample of the given files and returns the number of items read.
 *
 * @param fs the file system containing the input files
 * @param files the files (or directories of data files) to sample from
 * @param count the number of records to sample
 * @param seed the seed for the random number generator that picks sample offsets
 * @param output collector that receives the sampled records
 * @param inObj reusable object used to parse records from the input
 * @param outObj reusable object used for the converted output records
 * @return the number of records actually returned
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }

    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);
    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the
        // last line
        long data_end_offset = current_file_size;
        // Skip the last line too, to ensure that the mapped position
        // will be before some line in the block
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);
        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as records offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);
            // Read the first line after that offset
            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line

            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}

From source file: com.ricemap.spateDB.util.RandomSpatialGenerator.java

License: Apache License

public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}

From source file: com.run.mapred.hbase2tsv.HFileInputFormat_mr1.java

License: Apache License

@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job.getConfiguration());
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    return result;
}
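
The HIDDEN_FILE_FILTER constant used here (and in the _mr2 variant below) is defined elsewhere in the class; a plausible definition mirrors the hiddenFileFilter sketch shown earlier, i.e. a PathFilter that rejects names starting with "_" or ".".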

From source file: com.run.mapred.hbase2tsv.HFileInputFormat_mr2.java

License: Apache License

@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job);
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    // Use the typed toArray overload; a plain toArray() returns Object[], which cannot be cast to FileStatus[]
    return result.toArray(new FileStatus[result.size()]);
}

From source file: com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java

License: Apache License

private int getFinalFileNameCount(FileSystem fs, Path dir, final String prefix) throws IOException {
    return fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(prefix);
        }
    }).length;
}

From source file: com.TCG.Nutch_DNS.HostDb.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println(
                "Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
        System.err.println("\tcrawldb\tCrawlDb to update");
        System.err.println("\t-dir segments\tparent directory containing all segments to update from");
        System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
        System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
        System.err
                .println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
        System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
        System.err.println(
                "\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");

        return -1;
    }
    boolean normalize = getConf().getBoolean(HostDbFilter.URL_NORMALIZING, false);
    boolean filter = getConf().getBoolean(HostDbFilter.URL_FILTERING, false);
    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    boolean force = false;
    final FileSystem fs = FileSystem.get(getConf());
    HashSet<Path> dirs = new HashSet<Path>();
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-normalize")) {
            normalize = true;
        } else if (args[i].equals("-filter")) {
            filter = true;
        } else if (args[i].equals("-force")) {
            force = true;
        } else if (args[i].equals("-noAdditions")) {
            additionsAllowed = false;
        } else if (args[i].equals("-dir")) {
            FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
            dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else {
            dirs.add(new Path(args[i]));
        }
    }
    try {
        update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed,
                force);
        return 0;
    } catch (Exception e) {
        LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file: com.twitter.algebra.matrix.format.MapDir.java

License: Apache License

/**
 * [Re-]Initializes the MapDir with a new directory of MapFiles. {@link #close()}
 * must be called first if the MapDir is already initialized.
 *
 * @param dir the directory containing the MapFiles
 * @param conf the Hadoop configuration
 * @return one MapFile reader per MapFile found in the directory
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, mapFilter));

    partitioner = new TitlePartitioner(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}
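
The readers returned by getReaders can be probed directly with MapFile.Reader.get(key, value), which returns null when the key is not present in that partition. A rough usage sketch follows; how the MapDir instance is obtained and the IntWritable/Text key and value classes are assumptions made only for illustration.

Configuration conf = new Configuration();
MapDir mapDir = new MapDir();
MapFile.Reader[] readers = mapDir.getReaders(new Path("/data/matrix"), conf);

IntWritable key = new IntWritable(7);
Text value = new Text();
for (MapFile.Reader reader : readers) {
    // get() fills `value` and returns it if the key exists in this MapFile, otherwise returns null
    if (reader.get(key, value) != null) {
        System.out.println("Found " + key + " -> " + value);
        break;
    }
}

// Release the underlying SequenceFile readers when done
for (MapFile.Reader reader : readers) {
    reader.close();
}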