Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs FileSystem getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
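Before the collected examples, here is a minimal, self-contained sketch of calling getFileStatus directly. The path /tmp/example.txt is hypothetical; note that getFileStatus throws FileNotFoundException (a subclass of IOException) when the path does not exist.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; replace with a file that exists on your file system
        Path file = new Path("/tmp/example.txt");
        FileSystem fs = file.getFileSystem(new Configuration());

        // Throws FileNotFoundException if the path does not exist
        FileStatus status = fs.getFileStatus(file);
        System.out.println("Path:          " + status.getPath());
        System.out.println("Length:        " + status.getLen());
        System.out.println("Block size:    " + status.getBlockSize());
        System.out.println("Last modified: " + status.getModificationTime());
    }
}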

Usage

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Repartitions a file that is already in HDFS. It runs a MapReduce job
 * that partitions the file into cells, and writes each cell separately.
 * @param inFile the input file in HDFS
 * @param outPath the output (partitioned) path
 * @param stockShape an instance of the shape type stored in the file
 * @param blockSize the block size of the output file, or 0 to infer it from the input
 * @param sindex the type of spatial index ("grid", "rtree" or "r+tree")
 * @param overwrite whether to overwrite an existing output path
 * @param columnar whether to use columnar storage
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        String sindex, boolean overwrite, boolean columnar) throws IOException {

    FileSystem inFs = inFile.getFileSystem(new Configuration());
    FileSystem outFs = outPath.getFileSystem(new Configuration());

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
        } else {
            blockSize = outFs.getDefaultBlockSize(outPath);
        }
    }

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (sindex.equals("grid")) {
        Prism input_mbr = FileMBR.fileMBRMapReduce(inFs, inFile, stockShape, false);
        long inFileSize = FileMBR.sizeOfLastProcessedFile;
        int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outPath,
                blockSize);

        GridInfo gridInfo = new GridInfo(input_mbr.t1, input_mbr.x1, input_mbr.y1, input_mbr.t2, input_mbr.x2,
                input_mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // Pack in Prisms using an RTree
        cellInfos = packInPrisms(inFs, inFile, outFs, outPath, blockSize, stockShape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    repartitionMapReduce(inFile, outPath, stockShape, blockSize, cellInfos, sindex, overwrite, columnar);
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile the input file in HDFS
 * @param outPath the output (partitioned) path
 * @param stockShape an instance of the shape type stored in the file
 * @param blockSize the block size of the output file, or 0 to infer it from the input
 * @param cellInfos the cells to partition the file into
 * @param sindex the type of spatial index ("grid", "rtree" or "r+tree")
 * @param overwrite whether to overwrite an existing output path
 * @param columnar whether to use columnar storage
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");
    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    job.setBoolean(SpatialSite.PACK_CELLS, pack);
    job.setBoolean(SpatialSite.EXPAND_CELLS, expand);
    job.setStrings(SpatialSite.STORAGE_MODE, columnar ? "columnar" : "normal");

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    // Set default parameters for reading input file
    SpatialSite.setShapeClass(job, stockShape.getClass());

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }
    // Copy block size from source file if it's globally indexed
    FileSystem inFs = inFile.getFileSystem(job);

    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
            LOG.info("Automatically setting block size to " + blockSize);
        }
    }

    if (blockSize != 0)
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize);
    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, S stockShape, long blockSize,
        String sindex, boolean overwrite) throws IOException {

    FileSystem inFs = inFile.getFileSystem(new Configuration());
    FileSystem outFs = outFile.getFileSystem(new Configuration());

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
        }
    }

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (sindex.equals("grid")) {
        Prism input_mbr = FileMBR.fileMBRLocal(inFs, inFile, stockShape);
        long inFileSize = FileMBR.sizeOfLastProcessedFile;
        int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outFile,
                blockSize);

        GridInfo gridInfo = new GridInfo(input_mbr.t1, input_mbr.x1, input_mbr.y1, input_mbr.t2, input_mbr.x2,
                input_mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        cellInfos = packInPrisms(inFs, inFile, outFs, outFile, blockSize, stockShape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    repartitionLocal(inFile, outFile, stockShape, blockSize, cellInfos, sindex, overwrite);
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Repartitions a file on the local machine without MapReduce jobs.
 * @param in the input file in HDFS
 * @param out the output (partitioned) path
 * @param stockShape an instance of the shape type stored in the file
 * @param blockSize the block size of the output file, or 0 to infer it from the input
 * @param cells the cells to partition the file into
 * @param sindex the type of spatial index ("grid", "rtree" or "r+tree")
 * @param overwrite whether to overwrite an existing output path
 * @throws IOException
 */
public static <S extends Shape> void repartitionLocal(Path in, Path out, S stockShape, long blockSize,
        CellInfo[] cells, String sindex, boolean overwrite) throws IOException {
    FileSystem inFs = in.getFileSystem(new Configuration());
    FileSystem outFs = out.getFileSystem(new Configuration());
    // Overwrite output file
    if (outFs.exists(out)) {
        if (overwrite)
            outFs.delete(out, true);
        else
            throw new RuntimeException(
                    "Output file '" + out + "' already exists and overwrite flag is not set");
    }
    outFs.mkdirs(out);

    ShapeRecordWriter<Shape> writer;
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    if (sindex.equals("grid")) {
        writer = new GridRecordWriter<Shape>(out, null, null, cells, pack, expand);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        writer = new RTreeGridRecordWriter<Shape>(out, null, null, cells, pack, expand);
        writer.setStockObject(stockShape);
    } else {
        throw new RuntimeException("Unupoorted spatial idnex: " + sindex);
    }

    FileStatus inFileStatus = inFs.getFileStatus(in);
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, in);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(in, globalIndex.iterator().next().filename)).getBlockSize();
        }
    }
    if (blockSize != 0)
        ((GridRecordWriter<Shape>) writer).setBlockSize(blockSize);

    long length = inFileStatus.getLen();
    FSDataInputStream datain = inFs.open(in);
    ShapeRecordReader<S> reader = new ShapeRecordReader<S>(datain, 0, length);
    Prism c = reader.createKey();

    NullWritable dummy = NullWritable.get();

    while (reader.next(c, stockShape)) {
        writer.write(dummy, stockShape);
    }
    writer.close(null);
}

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

public static <T extends TextSerializable, O extends TextSerializable> int sampleWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    FileStatus inFStatus = fs.getFileStatus(files[0]);
    if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
        // Either a directory of files or a large file
        return sampleMapReduceWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    } else {
        // A single small file, process it without MapReduce
        return sampleLocalWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    }
}

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    long total_size = 0;
    for (Path file : files) {
        total_size += fs.getFileStatus(file).getLen();
    }
    sizeOfLastProcessedFile = total_size;
    return sampleLocalWithSize(fs, files, (long) (total_size * ratio), seed, output, inObj, outObj);
}

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

/**
 * Reads a sample of the given files and returns the number of items read.
 *
 * @param fs the file system containing the files
 * @param files the files to sample
 * @param count the number of records to sample
 * @param seed the seed for the random number generator
 * @param output a collector that receives the sampled records
 * @param inObj an instance of the record type stored in the files
 * @param outObj an instance of the record type reported to the output
 * @return the number of records actually read
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }

    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);
    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of file sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the
        // last line
        long data_end_offset = current_file_size;
        // Skip the last line as well to ensure that the mapped position
        // falls before some complete line in the block
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);
        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as record offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);
            // Read the first line after that offset
            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line

            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}

From source file:com.ricemap.spateDB.operations.Tail.java

License:Apache License

/**
 * Reads a maximum of n non-empty lines from the end of the given file.
 * The position of the earliest line read is returned.
 * @param fs the file system containing the file
 * @param file the file to read
 * @param n the maximum number of lines to read
 * @param stockObject an instance of the record type used to parse lines
 * @param output a collector that receives the parsed records
 * @return the position of the earliest line read
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FileSystem fs, Path file, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    FSDataInputStream in = null;
    try {
        in = fs.open(file);
        long length = fs.getFileStatus(file).getLen();
        in.seek(length);
        return tail(in, n, stockObject, output);
    } finally {
        if (in != null)
            in.close();
    }
}

From source file:com.ricemap.spateDB.util.ReadFile.java

License:Apache License

public static void main(String[] args) throws Exception {
    CommandLineArguments cla = new CommandLineArguments(args);
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getOffset() == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getOffset(), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}

From source file:com.rim.logdriver.admin.LogMaintenance.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // If run by Oozie, then load the Oozie conf too
    if (System.getProperty("oozie.action.conf.xml") != null) {
        conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml")));
    }

    // For some reason, Oozie needs some options to be set as system properties
    // instead of in the configuration. So copy the configs over.
    {
        Iterator<Entry<String, String>> i = conf.iterator();
        while (i.hasNext()) {
            Entry<String, String> next = i.next();
            System.setProperty(next.getKey(), next.getValue());
        }
    }

    if (args.length < 3) {
        printUsage();
        return 1;
    }

    String userName = args[0];
    String dcNumber = args[1];
    String service = args[2];
    String date = null;
    String hour = null;
    if (args.length >= 4) {
        date = args[3];
    }
    if (args.length >= 5) {
        hour = args[4];
    }

    // Set from environment variables
    oozieUrl = getConfOrEnv(conf, "OOZIE_URL");
    String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF");
    String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF");
    String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE");
    String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE");
    String maxConcurrentMergeJobs = getConfOrEnv(conf, "MAX_CONCURRENT_MERGE_JOBS");
    String maxConcurrentFilterJobs = getConfOrEnv(conf, "MAX_CONCURRENT_FILTER_JOBS");
    String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING");
    String logdir = getConfOrEnv(conf, "logdriver.logdir.name");
    boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs"));
    String rootDir = getConfOrEnv(conf, "service.root.dir");

    boolean doMerge = true;
    boolean doArchive = true;
    boolean doDelete = true;

    if (oozieUrl == null) {
        LOG.info("OOZIE_URL is not set.  Not merging or archiving.");
        doMerge = false;
        doArchive = false;
    }
    if (zkConnectString == null) {
        LOG.error("ZK_CONNECT_STRING is not set.  Exiting.");
        return 1;
    }
    if (mergeJobPropertiesFile == null) {
        LOG.info("MERGEJOB_CONF is not set.  Not merging.");
        doMerge = false;
    }
    if (filterJobPropertiesFile == null) {
        LOG.info("FILTERJOB_CONF is not set.  Not archiving.");
        doArchive = false;
    }
    if (daysBeforeArchive == null) {
        LOG.info("DAYS_BEFORE_ARCHIVE is not set.  Not archiving.");
        doArchive = false;
    }
    if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) {
        LOG.info("DAYS_BEFORE_ARCHIVE is negative.  Not archiving.");
        doArchive = false;
    }
    if (daysBeforeDelete == null) {
        LOG.info("DAYS_BEFORE_DELETE is not set.  Not deleting.");
        doDelete = false;
    }
    if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) {
        LOG.info("DAYS_BEFORE_DELETE is negative.  Not deleting.");
        doDelete = false;
    }
    if (maxConcurrentMergeJobs == null) {
        LOG.info("MAX_CONCURRENT_MERGE_JOBS is not set.  Using default value of -1.");
        maxConcurrentMergeJobs = "-1";
    }
    if (maxConcurrentFilterJobs == null) {
        LOG.info("MAX_CONCURRENT_FILTER_JOBS is not set.  Using default value of -1.");
        maxConcurrentFilterJobs = "-1";
    }
    if (logdir == null) {
        LOG.info("LOGDRIVER_LOGDIR_NAME is not set.  Using default value of 'logs'.");
        logdir = "logs";
    }
    if (rootDir == null) {
        LOG.info("SERVICE_ROOT_DIR is not set.  Using default value of '/service'.");
        rootDir = "/service";
    }

    // Now it's safe to create our Oozie Runners.
    OozieRunner mergeOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentMergeJobs));
    Thread mergeOozieRunnerThread = new Thread(mergeOozieRunner);
    mergeOozieRunnerThread.setName("OozieRunner - Merge");
    mergeOozieRunnerThread.setDaemon(false);
    mergeOozieRunnerThread.start();

    OozieRunner filterOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentFilterJobs));
    Thread filterOozieRunnerThread = new Thread(filterOozieRunner);
    filterOozieRunnerThread.setName("OozieRunner - Filter");
    filterOozieRunnerThread.setDaemon(false);
    filterOozieRunnerThread.start();

    // Figure out what date we start filters on.
    String filterCutoffDate = "";
    if (doArchive) {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive));
        filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR),
                (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY));
        LOG.info("Archiving logs from before {}", filterCutoffDate);
    }
    String deleteCutoffDate = "";
    if (doDelete) {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete));
        deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR),
                (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY));
        LOG.info("Deleting logs from before {}", deleteCutoffDate);
    }

    long now = System.currentTimeMillis();

    // Various exceptions have been popping up here, so make sure to catch
    // them all.
    try {
        // We can hang if this fails, so make sure we abort if it does.
        FileSystem fs = null;
        try {
            fs = FileSystem.get(conf);
            fs.exists(new Path("/")); // Test if it works.
        } catch (IOException e) {
            LOG.error("Error getting filesystem.", e);
            return 1;
        }
        // We'll need an Oozie client to check on orphaned directories.
        oozieClient = getOozieClient();

        // LockUtils are used in a couple of places
        LockUtil lu = new LockUtil(zkConnectString);

        // Patterns to recognize hour, day and incoming directories, so that they
        // can be processed.
        Pattern datePathPattern;
        Pattern hourPathPattern;
        Pattern incomingPathPattern;
        Pattern dataPathPattern;
        Pattern archivePathPattern;
        Pattern workingPathPattern;
        if (hour != null) {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")");
            hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")");
            incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/incoming");
            dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/data");
            archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/archive");
            workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)");
        } else if (date != null) {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")");
            hourPathPattern = Pattern
                    .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/"
                            + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})");
            incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/incoming");
            dataPathPattern = Pattern
                    .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/"
                            + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/data");
            archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/archive");
            workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)");
        } else {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})");
            hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})");
            incomingPathPattern = Pattern
                    .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/"
                            + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming");
            dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data");
            archivePathPattern = Pattern
                    .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/"
                            + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive");
            workingPathPattern = Pattern
                    .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/"
                            + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)");
        }

        // Do a depth-first search of the directory, processing anything that
        // looks interesting along the way
        Deque<Path> paths = new ArrayDeque<Path>();
        Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/");
        paths.push(rootPath);

        while (paths.size() > 0) {
            Path p = paths.pop();
            LOG.debug("{}", p.toString());

            if (!fs.exists(p)) {
                continue;
            }

            FileStatus dirStatus = fs.getFileStatus(p);
            FileStatus[] children = fs.listStatus(p);
            boolean addChildren = true;

            boolean old = dirStatus.getModificationTime() < now - WAIT_TIME;
            LOG.debug("    Was last modified {}ms ago", now - dirStatus.getModificationTime());

            if (!old) {
                LOG.debug("    Skipping, since it's not old enough.");

            } else if ((!rootPath.equals(p)) && (children.length == 0
                    || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) {
                // old and no children? Delete!
                LOG.info("    Deleting empty directory {}", p.toString());
                fs.delete(p, true);

            } else {
                Matcher matcher = datePathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking date directory");

                    // If this is already done, then skip it. So only process if it
                    // doesn't exist.
                    if (fs.exists(new Path(p, READY_MARKER)) == false) {
                        // Check each subdirectory. If they all have ready markers, then I
                        // guess we're ready.
                        boolean ready = true;
                        for (FileStatus c : children) {
                            if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) {
                                ready = false;
                                break;
                            }
                        }

                        if (ready) {
                            fs.createNewFile(new Path(p, READY_MARKER));
                        }
                    }
                }

                matcher = hourPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking hour directory");

                    // If this is already done, then skip it. So only process if it
                    // doesn't exist.
                    if (fs.exists(new Path(p, READY_MARKER)) == false) {
                        // Check each subdirectory. If they all have ready markers, then I
                        // guess we're ready.
                        boolean ready = true;
                        for (FileStatus c : children) {
                            if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) {
                                ready = false;
                                break;
                            }
                        }

                        if (ready) {
                            fs.createNewFile(new Path(p, READY_MARKER));
                        }
                    }
                }

                // Check to see if we have to run a merge
                matcher = incomingPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking incoming directory");
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);
                    String matchComponent = matcher.group(3);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    } else if (doMerge) {

                        // old, looks right, and has children? Run it!
                        boolean hasMatchingChildren = false;
                        boolean subdirTooYoung = false;

                        for (FileStatus child : children) {
                            if (!hasMatchingChildren) {
                                FileStatus[] grandchildren = fs.listStatus(child.getPath());
                                for (FileStatus gc : grandchildren) {
                                    if (VALID_FILE.matcher(gc.getPath().getName()).matches()) {
                                        hasMatchingChildren = true;
                                        break;
                                    }
                                }
                            }
                            if (!subdirTooYoung) {
                                if (child.getModificationTime() >= now - WAIT_TIME) {
                                    subdirTooYoung = true;
                                    LOG.debug("    Subdir {} is too young.", child.getPath());
                                }
                            }
                        }

                        if (!hasMatchingChildren) {
                            LOG.debug("    No files match the expected pattern ({})", VALID_FILE.pattern());
                        }

                        if (hasMatchingChildren && !subdirTooYoung) {
                            LOG.info("    Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(),
                                    dcNumber, service, matchDate, matchHour, matchComponent });

                            Properties oozieJobProps = new Properties();
                            oozieJobProps.load(new FileInputStream(mergeJobPropertiesFile));

                            oozieJobProps.setProperty("rootDir", rootDir);
                            oozieJobProps.setProperty("dcNumber", dcNumber);
                            oozieJobProps.setProperty("service", service);
                            oozieJobProps.setProperty("date", matchDate);
                            oozieJobProps.setProperty("hour", matchHour);
                            oozieJobProps.setProperty("component", matchComponent);
                            oozieJobProps.setProperty("user.name", userName);
                            oozieJobProps.setProperty("logdir", logdir);

                            mergeOozieRunner.submit(oozieJobProps);

                            addChildren = false;
                        }
                    }
                }

                // Check to see if we need to run a filter and archive
                matcher = dataPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);
                    String matchComponent = matcher.group(3);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) {

                        Properties oozieJobProps = new Properties();
                        oozieJobProps.load(new FileInputStream(filterJobPropertiesFile));

                        oozieJobProps.setProperty("rootDir", rootDir);
                        oozieJobProps.setProperty("dcNumber", dcNumber);
                        oozieJobProps.setProperty("service", service);
                        oozieJobProps.setProperty("date", matchDate);
                        oozieJobProps.setProperty("hour", matchHour);
                        oozieJobProps.setProperty("component", matchComponent);
                        oozieJobProps.setProperty("user.name", userName);
                        oozieJobProps.setProperty("logdir", logdir);

                        // Check to see if we should just keep all or delete all here.
                        // The filter file should be here
                        String appPath = oozieJobProps.getProperty("oozie.wf.application.path");
                        appPath = appPath.replaceFirst("\\$\\{.*?\\}", "");
                        Path filterFile = new Path(appPath + "/" + service + ".yaml");
                        LOG.info("Filter file is {}", filterFile);
                        if (fs.exists(filterFile)) {
                            List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent,
                                    fs.open(filterFile));

                            if (filters == null) {
                                LOG.warn(
                                        "    Got null when getting filters.  Not processing. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                            } else if (filters.size() == 0) {
                                LOG.warn("    Got no filters.  Not processing. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                            } else if (filters.size() == 1
                                    && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) {
                                LOG.info("    Keeping everything. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                // Move files from data to archive
                                // delete it all!
                                String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir
                                        + "/" + matchDate + "/" + matchHour + "/" + matchComponent
                                        + "/archive/";

                                String[] moveArgs = { zkConnectString, dcNumber, service, matchDate, matchHour,
                                        matchComponent, "move " + p.toUri().getPath() + " " + destination };
                                ToolRunner.run(new Configuration(), new LockedFs(), moveArgs);
                            } else if (filters.size() == 1
                                    && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) {
                                LOG.info("    Dropping everything. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                // delete it all!
                                String[] delArgs = { zkConnectString, dcNumber, service, matchDate, matchHour,
                                        matchComponent, "delete " + p.toUri().getPath() };
                                ToolRunner.run(new Configuration(), new LockedFs(), delArgs);
                            } else {
                                LOG.info("    Run Filter/Archive job {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                filterOozieRunner.submit(oozieJobProps);
                            }
                        } else {
                            LOG.warn("Skipping filter job, since no filter file exists");
                        }

                        addChildren = false;
                    }
                }

                matcher = archivePathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    }
                }

                matcher = workingPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.info("  Matches working pattern");
                    if (resetOrphanedJobs) {
                        String matchDate = matcher.group(1);
                        String matchHour = matcher.group(2);
                        String matchComponent = matcher.group(3);
                        String matchOozieJobId = matcher.group(4);

                        // Check to see what's up with the Oozie job. If it's
                        // still running, we don't want to touch it.
                        Status status = null;
                        try {
                            WorkflowJob jobInfo = oozieClient.getJobInfo(matchOozieJobId);
                            status = jobInfo.getStatus();
                        } catch (OozieClientException e) {
                            if (e.getMessage() != null && e.getMessage().contains("Job does not exist")) {
                                LOG.info("Oozie job not found.  Proceeding as though job was failed.", e);
                                status = Status.FAILED;
                            } else {
                                LOG.error("Oozie client error.  Not Proceeding.", e);
                            }
                        }
                        LOG.info("  Oozie job status is {}", status);
                        if (status != null && status != Status.RUNNING && status != Status.PREP
                                && status != Status.SUSPENDED) {
                            // Move everything from working/xxx/incoming/ to incoming/
                            PathInfo lockPathInfo = new PathInfo(rootDir + "/" + dcNumber + "/" + service + "/"
                                    + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent);
                            lu.acquireWriteLock(lu.getLockPath(lockPathInfo));

                            FileStatus[] fileStatuses = fs
                                    .listStatus(new Path(p.toUri().getPath() + "/incoming/"));
                            if (fileStatuses != null) {
                                for (FileStatus fileStatus : fileStatuses) {
                                    Path toPath = new Path(fileStatus.getPath().getParent().getParent()
                                            .getParent().getParent(),
                                            "incoming/" + fileStatus.getPath().getName());

                                    LOG.info("  Moving data from {} to {}", fileStatus.getPath(), toPath);
                                    LOG.info("    mkdir {}", toPath);
                                    fs.mkdirs(toPath);

                                    Path fromDir = new Path(p.toUri().getPath(),
                                            "incoming/" + fileStatus.getPath().getName());
                                    LOG.info("    moving from {}", fromDir);
                                    FileStatus[] files = fs.listStatus(fromDir);
                                    if (files == null || files.length == 0) {
                                        LOG.info("    Nothing to move from  {}", fromDir);
                                    } else {
                                        for (FileStatus f : files) {
                                            LOG.info("    rename {} {}", f.getPath(),
                                                    new Path(toPath, f.getPath().getName()));
                                            fs.rename(f.getPath(), new Path(toPath, f.getPath().getName()));
                                        }
                                    }

                                    LOG.info("    rm {}", fileStatus.getPath().getParent().getParent());
                                    fs.delete(fileStatus.getPath().getParent().getParent(), true);
                                }

                                lu.releaseWriteLock(lu.getLockPath(lockPathInfo));

                            }
                        }
                    }

                    addChildren = false;
                }
            }

            // Add any children which are directories to the stack.
            if (addChildren) {
                for (int i = children.length - 1; i >= 0; i--) {
                    FileStatus child = children[i];
                    if (child.isDir()) {
                        paths.push(child.getPath());
                    }
                }
            }
        }

        // Since we may have deleted a bunch of directories, delete any
        // unused locks from ZooKeeper.
        {
            LOG.info("Checking for unused locks in ZooKeeper");
            String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir;
            if (date != null) {
                scanPath += "/" + date;
                if (hour != null) {
                    scanPath += "/" + hour;
                }
            }

            List<LockInfo> lockInfo = lu.scan(scanPath);

            for (LockInfo li : lockInfo) {
                // Check if the lock path still exists in HDFS. If it doesn't, then
                // delete it from ZooKeeper.
                String path = li.getPath();
                String hdfsPath = path.substring(LockUtil.ROOT.length());
                if (!fs.exists(new Path(hdfsPath))) {
                    ZooKeeper zk = lu.getZkClient();

                    while (!path.equals(LockUtil.ROOT)) {
                        try {
                            zk.delete(path, -1);
                        } catch (KeeperException.NotEmptyException e) {
                            // That's fine. just stop trying then.
                            break;
                        } catch (Exception e) {
                            LOG.error("Caught exception trying to delete from ZooKeeper.", e);
                            break;
                        }
                        LOG.info("Deleted from ZooKeeper: {}", path);
                        path = path.substring(0, path.lastIndexOf('/'));
                    }

                }
            }
        }
        lu.close();

        // Now that we're done, wait for the Oozie Runner to stop, and print the
        // results.
        LOG.info("Waiting for Oozie jobs to complete.");
        mergeOozieRunner.shutdown();
        mergeOozieRunnerThread.join();
        LOG.info("Oozie Job Stats : Merge  : Started={} Succeeded={} failed={} errors={}",
                new Object[] { mergeOozieRunner.getStarted(), mergeOozieRunner.getSucceeded(),
                        mergeOozieRunner.getFailed(), mergeOozieRunner.getErrors() });

        filterOozieRunner.shutdown();
        filterOozieRunnerThread.join();
        LOG.info("Oozie Job Stats : Filter : Started={} Succeeded={} failed={} errors={}",
                new Object[] { filterOozieRunner.getStarted(), filterOozieRunner.getSucceeded(),
                        filterOozieRunner.getFailed(), filterOozieRunner.getErrors() });

    } catch (Exception e) {
        LOG.error("Unexpected exception caught.", e);
        return 1;
    }

    return 0;
}