List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
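None of the examples below uses this Path[] overload directly, so here is a minimal, hypothetical sketch of calling it; the input directories and the ".tmp" suffix filter are assumptions for illustration only, not taken from any of the source files listed.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical input directories; any existing paths would do
        Path[] inputs = new Path[] { new Path("/data/in1"), new Path("/data/in2") };
        // List the contents of all input paths, skipping temporary files
        FileStatus[] statuses = fs.listStatus(inputs, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().endsWith(".tmp");
            }
        });
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}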
From source file:com.redsqirl.workflow.server.Workflow.java
License:Open Source License
/**
 * Clean the backup directory.
 *
 * @throws IOException
 */
public void cleanUpBackup() throws IOException {
    String path = WorkflowPrefManager.getBackupPath();
    int nbBackup = WorkflowPrefManager.getNbBackup();
    FileSystem fs = NameNodeVar.getFS();
    // FileStatus stat = fs.getFileStatus(new Path(path));
    FileStatus[] fsA = fs.listStatus(new Path(path), new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return arg0.getName().matches(".*[0-9]{14}(.rs|.srs)$");
        }
    });
    logger.debug("Backup directory: " + fsA.length + " files, " + nbBackup + " to keep, "
            + Math.max(0, fsA.length - nbBackup) + " to remove");
    if (fsA.length > nbBackup) {
        int numberToRemove = fsA.length - nbBackup;
        Map<Path, Long> pathToRemove = new HashMap<Path, Long>();
        Path pathMin = null;
        Long min = Long.MAX_VALUE;
        for (FileStatus stat : fsA) {
            if (pathToRemove.size() < numberToRemove) {
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            } else if (min > stat.getModificationTime()) {
                pathToRemove.remove(pathMin);
                pathToRemove.put(stat.getPath(), stat.getModificationTime());
            }
            if (min > stat.getModificationTime()) {
                min = stat.getModificationTime();
                pathMin = stat.getPath();
            }
        }
        for (Path pathDel : pathToRemove.keySet()) {
            fs.delete(pathDel, false);
        }
    }
    // fs.close();
}
From source file:com.ricemap.spateDB.core.SpatialSite.java
License:Apache License
public static GlobalIndex<Partition> getGlobalIndex(FileSystem fs, Path dir) throws IOException {
    // Retrieve the master file (the only file with the name _master in it)
    FileStatus[] masterFiles = fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    // Check if the given file is indexed
    if (masterFiles.length == 0)
        return null;
    if (masterFiles.length > 1)
        throw new RuntimeException("Found more than one master file in " + dir);
    Path masterFile = masterFiles[0].getPath();
    ShapeRecordReader<Partition> reader = new ShapeRecordReader<Partition>(fs.open(masterFile), 0,
            fs.getFileStatus(masterFile).getLen());
    CellInfo dummy = new CellInfo();
    Partition partition = new Partition();
    ArrayList<Partition> partitions = new ArrayList<Partition>();
    while (reader.next(dummy, partition)) {
        partitions.add(partition.clone());
    }
    GlobalIndex<Partition> globalIndex = new GlobalIndex<Partition>();
    globalIndex.bulkLoad(partitions.toArray(new Partition[partitions.size()]));
    globalIndex.setCompact(masterFile.getName().endsWith("rtree") || masterFile.getName().endsWith("r+tree"));
    globalIndex.setReplicated(masterFile.getName().endsWith("r+tree") || masterFile.getName().endsWith("grid"));
    return globalIndex;
}
From source file:com.ricemap.spateDB.mapred.SpatialInputFormat.java
License:Apache License
protected void listStatus(final FileSystem fs, final Path dir, final List<FileStatus> result,
        BlockFilter filter) throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    FileStatus[] listStatus = fs.listStatus(dir, hiddenFileFilter);
    if (gindex == null) {
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else {
                result.add(status);
            }
        }
    } else {
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(dir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Reads a sample of the given files and returns the number of items read.
 *
 * @param fs
 * @param files
 * @param count
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }
    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);

    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block;
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the last line
        long data_end_offset = current_file_size;
        // Skip the last line too to ensure that the mapped position
        // will be before some line in the block
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);

        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as records offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);
            // Read the first line after that offset
            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line
            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}
From source file:com.ricemap.spateDB.util.RandomSpatialGenerator.java
License:Apache License
public static void generateMapReduce(Path file, Prism mbr, long size, long blocksize, Shape shape,
        String sindex, long seed, int rectsize, RandomShapeGenerator.DistributionType type, boolean overwrite)
        throws IOException {
    JobConf job = new JobConf(RandomSpatialGenerator.class);

    job.setJobName("Generator");
    FileSystem outFs = file.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(file)) {
        if (overwrite)
            outFs.delete(file, true);
        else
            throw new RuntimeException(
                    "Output file '" + file + "' already exists and overwrite flag is not set");
    }

    // Set generation parameters in job
    job.setLong(RandomShapeGenerator.GenerationSize, size);
    SpatialSite.setPrism(job, RandomShapeGenerator.GenerationMBR, mbr);
    if (seed != 0)
        job.setLong(RandomShapeGenerator.GenerationSeed, seed);
    if (rectsize != 0)
        job.setInt(RandomShapeGenerator.GenerationRectSize, rectsize);
    if (type != null)
        job.set(RandomShapeGenerator.GenerationType, type.toString());

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setShapeClass(job, shape.getClass());

    if (blocksize != 0) {
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blocksize);
    }

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.t1, mbr.x1, mbr.y1, mbr.t2, mbr.x2, mbr.y2);
        FileSystem fs = file.getFileSystem(job);
        if (blocksize == 0) {
            blocksize = fs.getDefaultBlockSize(file);
        }
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, file, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, file);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(file, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(file, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();

    // Plot an image for the partitions used in file
    Path imagePath = new Path(file, "_partitions.png");
    int imageSize = (int) (Math.sqrt(cells.length) * 300);
    Plot.plotLocal(masterPath, imagePath, new Partition(), imageSize, imageSize, Color.BLACK, false, false,
            false);
}
From source file:com.run.mapred.hbase2tsv.HFileInputFormat_mr1.java
License:Apache License
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job.getConfiguration());
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    return result;
}
From source file:com.run.mapred.hbase2tsv.HFileInputFormat_mr2.java
License:Apache License
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();

    // Explode out directories that match the original FileInputFormat
    // filters since HFiles are written to directories where the
    // directory name is the column name
    for (FileStatus status : super.listStatus(job)) {
        if (status.isDirectory()) {
            FileSystem fs = status.getPath().getFileSystem(job);
            for (FileStatus match : fs.listStatus(status.getPath(), HIDDEN_FILE_FILTER)) {
                result.add(match);
            }
        } else {
            result.add(status);
        }
    }

    return result.toArray(new FileStatus[result.size()]);
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
private int getFinalFileNameCount(FileSystem fs, Path dir, final String prefix) throws IOException {
    return fs.listStatus(dir, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(prefix);
        }
    }).length;
}
From source file:com.TCG.Nutch_DNS.HostDb.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println(
                "Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
        System.err.println("\tcrawldb\tCrawlDb to update");
        System.err.println("\t-dir segments\tparent directory containing all segments to update from");
        System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
        System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
        System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
        System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
        System.err.println(
                "\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
        return -1;
    }
    boolean normalize = getConf().getBoolean(HostDbFilter.URL_NORMALIZING, false);
    boolean filter = getConf().getBoolean(HostDbFilter.URL_FILTERING, false);
    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    boolean force = false;
    final FileSystem fs = FileSystem.get(getConf());
    HashSet<Path> dirs = new HashSet<Path>();
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-normalize")) {
            normalize = true;
        } else if (args[i].equals("-filter")) {
            filter = true;
        } else if (args[i].equals("-force")) {
            force = true;
        } else if (args[i].equals("-noAdditions")) {
            additionsAllowed = false;
        } else if (args[i].equals("-dir")) {
            FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
            dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else {
            dirs.add(new Path(args[i]));
        }
    }
    try {
        update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed,
                force);
        return 0;
    } catch (Exception e) {
        LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
        return -1;
    }
}
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
/**
 * [Re-]Initialize the MapDir with a new directory of MapFiles. {@link #close()}
 * must be called first if MapDir is already initialized.
 *
 * @param dir
 * @param conf
 * @return
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public MapFile.Reader[] getReaders(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, mapFilter));
    partitioner = new TitlePartitioner(names);

    MapFile.Reader[] parts = new MapFile.Reader[names.length];
    for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
    }
    return parts;
}