List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
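getFileStatus returns a FileStatus object describing the named path (length, block size, modification time, whether it is a directory) and throws FileNotFoundException if the path does not exist. As a minimal stand-alone sketch before the examples below — the class name and the path /tmp/data.txt are illustrative placeholders, not taken from any of the source files listed here:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GetFileStatusExample {
        public static void main(String[] args) throws Exception {
            // Hypothetical path; substitute any file or directory in your cluster
            Path path = new Path("/tmp/data.txt");
            Configuration conf = new Configuration();
            FileSystem fs = path.getFileSystem(conf);

            // Fetch the metadata for the path; throws FileNotFoundException if it is missing
            FileStatus status = fs.getFileStatus(path);

            System.out.println("length    = " + status.getLen());
            System.out.println("blockSize = " + status.getBlockSize());
            System.out.println("modified  = " + status.getModificationTime());
            System.out.println("isDir     = " + status.isDir()); // deprecated in newer releases in favor of isDirectory()
        }
    }

The examples that follow rely on the same accessors: getBlockSize() to copy a source file's block size, getLen() to measure input size, isDir() to branch between directories and plain files, and getModificationTime() to test how old a directory is.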
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
/**
 * Repartitions a file that is already in HDFS. It runs a MapReduce job
 * that partitions the file into cells, and writes each cell separately.
 * @param inFile
 * @param outPath
 * @param stockShape
 * @param blockSize
 * @param sindex
 * @param overwrite
 * @param columnar
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        String sindex, boolean overwrite, boolean columnar) throws IOException {
    FileSystem inFs = inFile.getFileSystem(new Configuration());
    FileSystem outFs = outPath.getFileSystem(new Configuration());

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
        } else {
            blockSize = outFs.getDefaultBlockSize(outPath);
        }
    }

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (sindex.equals("grid")) {
        Prism input_mbr = FileMBR.fileMBRMapReduce(inFs, inFile, stockShape, false);
        long inFileSize = FileMBR.sizeOfLastProcessedFile;
        int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outPath,
                blockSize);
        GridInfo gridInfo = new GridInfo(input_mbr.t1, input_mbr.x1, input_mbr.y1, input_mbr.t2, input_mbr.x2,
                input_mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // Pack in Prisms using an RTree
        cellInfos = packInPrisms(inFs, inFile, outFs, outPath, blockSize, stockShape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    repartitionMapReduce(inFile, outPath, stockShape, blockSize, cellInfos, sindex, overwrite, columnar);
}
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
/**
 * Repartitions an input file according to the given list of cells.
 * @param inFile
 * @param outPath
 * @param stockShape
 * @param blockSize
 * @param cellInfos
 * @param sindex
 * @param overwrite
 * @param columnar
 * @throws IOException
 */
public static void repartitionMapReduce(Path inFile, Path outPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite, boolean columnar) throws IOException {
    JobConf job = new JobConf(Repartition.class);
    job.setJobName("Repartition");
    FileSystem outFs = outPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    job.setBoolean(SpatialSite.PACK_CELLS, pack);
    job.setBoolean(SpatialSite.EXPAND_CELLS, expand);
    job.setStrings(SpatialSite.STORAGE_MODE, columnar ? "columnar" : "normal");
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    // Set default parameters for reading input file
    SpatialSite.setShapeClass(job, stockShape.getClass());

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // Copy block size from source file if it's globally indexed
    FileSystem inFs = inFile.getFileSystem(job);
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
            LOG.info("Automatically setting block size to " + blockSize);
        }
    }

    if (blockSize != 0)
        job.setLong(SpatialSite.LOCAL_INDEX_BLOCK_SIZE, blockSize);
    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);
}
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, S stockShape, long blockSize,
        String sindex, boolean overwrite) throws IOException {
    FileSystem inFs = inFile.getFileSystem(new Configuration());
    FileSystem outFs = outFile.getFileSystem(new Configuration());

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(inFile, globalIndex.iterator().next().filename))
                    .getBlockSize();
        }
    }

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (sindex.equals("grid")) {
        Prism input_mbr = FileMBR.fileMBRLocal(inFs, inFile, stockShape);
        long inFileSize = FileMBR.sizeOfLastProcessedFile;
        int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outFile,
                blockSize);
        GridInfo gridInfo = new GridInfo(input_mbr.t1, input_mbr.x1, input_mbr.y1, input_mbr.t2, input_mbr.x2,
                input_mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        cellInfos = packInPrisms(inFs, inFile, outFs, outFile, blockSize, stockShape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    repartitionLocal(inFile, outFile, stockShape, blockSize, cellInfos, sindex, overwrite);
}
From source file:com.ricemap.spateDB.operations.Repartition.java
License:Apache License
/**
 * Repartitions a file on the local machine without MapReduce jobs.
 * @param in
 * @param out
 * @param stockShape
 * @param blockSize
 * @param cells
 * @param sindex
 * @param overwrite
 * @throws IOException
 */
public static <S extends Shape> void repartitionLocal(Path in, Path out, S stockShape, long blockSize,
        CellInfo[] cells, String sindex, boolean overwrite) throws IOException {
    FileSystem inFs = in.getFileSystem(new Configuration());
    FileSystem outFs = out.getFileSystem(new Configuration());

    // Overwrite output file
    if (outFs.exists(out)) {
        if (overwrite)
            outFs.delete(out, true);
        else
            throw new RuntimeException(
                    "Output file '" + out + "' already exists and overwrite flag is not set");
    }
    outFs.mkdirs(out);

    ShapeRecordWriter<Shape> writer;
    boolean pack = sindex.equals("r+tree");
    boolean expand = sindex.equals("rtree");
    if (sindex.equals("grid")) {
        writer = new GridRecordWriter<Shape>(out, null, null, cells, pack, expand);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        writer = new RTreeGridRecordWriter<Shape>(out, null, null, cells, pack, expand);
        writer.setStockObject(stockShape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    FileStatus inFileStatus = inFs.getFileStatus(in);
    // Copy blocksize from source file if it's globally indexed
    if (blockSize == 0) {
        GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, in);
        if (globalIndex != null) {
            blockSize = inFs.getFileStatus(new Path(in, globalIndex.iterator().next().filename)).getBlockSize();
        }
    }
    if (blockSize != 0)
        ((GridRecordWriter<Shape>) writer).setBlockSize(blockSize);

    long length = inFileStatus.getLen();
    FSDataInputStream datain = inFs.open(in);
    ShapeRecordReader<S> reader = new ShapeRecordReader<S>(datain, 0, length);
    Prism c = reader.createKey();

    NullWritable dummy = NullWritable.get();

    while (reader.next(c, stockShape)) {
        writer.write(dummy, stockShape);
    }
    writer.close(null);
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
public static <T extends TextSerializable, O extends TextSerializable> int sampleWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    FileStatus inFStatus = fs.getFileStatus(files[0]);
    if (inFStatus.isDir() || inFStatus.getLen() / inFStatus.getBlockSize() > 1) {
        // Either a directory of files or a large file
        return sampleMapReduceWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    } else {
        // A single small file, process it without MapReduce
        return sampleLocalWithRatio(fs, files, ratio, threshold, seed, output, inObj, outObj);
    }
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalWithRatio(FileSystem fs,
        Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj,
        O outObj) throws IOException {
    long total_size = 0;
    for (Path file : files) {
        total_size += fs.getFileStatus(file).getLen();
    }
    sizeOfLastProcessedFile = total_size;
    return sampleLocalWithSize(fs, files, (long) (total_size * ratio), seed, output, inObj, outObj);
}
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Reads a sample of the given files and returns the number of items read.
 *
 * @param fs
 * @param files
 * @param count
 * @param seed
 * @param output
 * @param inObj
 * @param outObj
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {

    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }

    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);
    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the
        // last line
        long data_end_offset = current_file_size;
        // Skip the last line too to ensure that the mapped position
        // will be before some line in the block
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);

        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as records offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);
            // Read the first line after that offset
            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line

            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n non-empty lines from the end of the given file.
 * The position of the earliest line read is returned.
 * @param fs
 * @param file
 * @param n
 * @param stockObject
 * @param output
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FileSystem fs, Path file, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    FSDataInputStream in = null;
    try {
        in = fs.open(file);
        long length = fs.getFileStatus(file).getLen();
        in.seek(length);
        return tail(in, n, stockObject, output);
    } finally {
        if (in != null)
            in.close();
    }
}
From source file:com.ricemap.spateDB.util.ReadFile.java
License:Apache License
public static void main(String[] args) throws Exception {
    CommandLineArguments cla = new CommandLineArguments(args);
    Path input = cla.getPath();
    if (input == null) {
        printUsage();
        throw new RuntimeException("Illegal parameters");
    }
    Configuration conf = new Configuration();
    Path inFile = new Path(args[0]);
    FileSystem fs = inFile.getFileSystem(conf);

    long length = fs.getFileStatus(inFile).getLen();

    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inFile);
    if (gindex == null) {
        BlockLocation[] locations = cla.getOffset() == -1
                ? fs.getFileBlockLocations(fs.getFileStatus(inFile), 0, length)
                : fs.getFileBlockLocations(fs.getFileStatus(inFile), cla.getOffset(), 1);
        System.out.println(locations.length + " heap blocks");
    } else {
        for (Partition p : gindex) {
            long partition_length = fs.getFileStatus(new Path(inFile, p.filename)).getLen();
            System.out.println(p + " --- " + partition_length);
        }
    }
}
From source file:com.rim.logdriver.admin.LogMaintenance.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // If run by Oozie, then load the Oozie conf too
    if (System.getProperty("oozie.action.conf.xml") != null) {
        conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml")));
    }

    // For some reason, Oozie needs some options to be set in system instead of
    // in the configuration. So copy the configs over.
    {
        Iterator<Entry<String, String>> i = conf.iterator();
        while (i.hasNext()) {
            Entry<String, String> next = i.next();
            System.setProperty(next.getKey(), next.getValue());
        }
    }

    if (args.length < 3) {
        printUsage();
        return 1;
    }

    String userName = args[0];
    String dcNumber = args[1];
    String service = args[2];
    String date = null;
    String hour = null;
    if (args.length >= 4) {
        date = args[3];
    }
    if (args.length >= 5) {
        hour = args[4];
    }

    // Set from environment variables
    oozieUrl = getConfOrEnv(conf, "OOZIE_URL");
    String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF");
    String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF");
    String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE");
    String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE");
    String maxConcurrentMergeJobs = getConfOrEnv(conf, "MAX_CONCURRENT_MERGE_JOBS");
    String maxConcurrentFilterJobs = getConfOrEnv(conf, "MAX_CONCURRENT_FILTER_JOBS");
    String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING");
    String logdir = getConfOrEnv(conf, "logdriver.logdir.name");
    boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs"));
    String rootDir = getConfOrEnv(conf, "service.root.dir");

    boolean doMerge = true;
    boolean doArchive = true;
    boolean doDelete = true;

    if (oozieUrl == null) {
        LOG.info("OOZIE_URL is not set. Not merging or archiving.");
        doMerge = false;
        doArchive = false;
    }
    if (zkConnectString == null) {
        LOG.error("ZK_CONNECT_STRING is not set. Exiting.");
        return 1;
    }
    if (mergeJobPropertiesFile == null) {
        LOG.info("MERGEJOB_CONF is not set. Not merging.");
        doMerge = false;
    }
    if (filterJobPropertiesFile == null) {
        LOG.info("FILTERJOB_CONF is not set. Not archiving.");
        doArchive = false;
    }
    if (daysBeforeArchive == null) {
        LOG.info("DAYS_BEFORE_ARCHIVE is not set. Not archiving.");
        doArchive = false;
    }
    if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) {
        LOG.info("DAYS_BEFORE_ARCHIVE is negative. Not archiving.");
        doArchive = false;
    }
    if (daysBeforeDelete == null) {
        LOG.info("DAYS_BEFORE_DELETE is not set. Not deleting.");
        doDelete = false;
    }
    if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) {
        LOG.info("DAYS_BEFORE_DELETE is negative. Not deleting.");
        doDelete = false;
    }
    if (maxConcurrentMergeJobs == null) {
        LOG.info("MAX_CONCURRENT_MERGE_JOBS is not set. Using default value of -1.");
        maxConcurrentMergeJobs = "-1";
    }
    if (maxConcurrentFilterJobs == null) {
        LOG.info("MAX_CONCURRENT_FILTER_JOBS is not set. Using default value of -1.");
        maxConcurrentFilterJobs = "-1";
    }
    if (logdir == null) {
        LOG.info("LOGDRIVER_LOGDIR_NAME is not set. Using default value of 'logs'.");
        logdir = "logs";
    }
    if (rootDir == null) {
        LOG.info("SERVICE_ROOT_DIR is not set. Using default value of 'service'.");
        rootDir = "/service";
    }

    // Now it's safe to create our Oozie Runners.
    OozieRunner mergeOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentMergeJobs));
    Thread mergeOozieRunnerThread = new Thread(mergeOozieRunner);
    mergeOozieRunnerThread.setName("OozieRunner - Merge");
    mergeOozieRunnerThread.setDaemon(false);
    mergeOozieRunnerThread.start();

    OozieRunner filterOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentFilterJobs));
    Thread filterOozieRunnerThread = new Thread(filterOozieRunner);
    filterOozieRunnerThread.setName("OozieRunner - Filter");
    filterOozieRunnerThread.setDaemon(false);
    filterOozieRunnerThread.start();

    // Figure out what date we start filters on.
    String filterCutoffDate = "";
    if (doArchive) {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive));
        filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR),
                (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY));
        LOG.info("Archiving logs from before {}", filterCutoffDate);
    }
    String deleteCutoffDate = "";
    if (doDelete) {
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete));
        deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR),
                (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY));
        LOG.info("Deleting logs from before {}", deleteCutoffDate);
    }

    long now = System.currentTimeMillis();

    // Various exceptions have been popping up here. So make sure I catch them all.
    try {
        // We can hang if this fails. So make sure we abort if it fails.
        FileSystem fs = null;
        try {
            fs = FileSystem.get(conf);
            fs.exists(new Path("/")); // Test if it works.
        } catch (IOException e) {
            LOG.error("Error getting filesystem.", e);
            return 1;
        }

        // We'll need an Oozie client to check on orphaned directories.
        oozieClient = getOozieClient();

        // LockUtils are used in a couple of places
        LockUtil lu = new LockUtil(zkConnectString);

        // Patterns to recognize hour, day and incoming directories, so that they
        // can be processed.
        Pattern datePathPattern;
        Pattern hourPathPattern;
        Pattern incomingPathPattern;
        Pattern dataPathPattern;
        Pattern archivePathPattern;
        Pattern workingPathPattern;
        if (hour != null) {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")");
            hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")");
            incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/incoming");
            dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/data");
            archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/archive");
            workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/("
                    + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)");
        } else if (date != null) {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")");
            hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})");
            incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/incoming");
            dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/data");
            archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/archive");
            workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date)
                    + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)");
        } else {
            datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})");
            hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})");
            incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming");
            dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data");
            archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive");
            workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/"
                    + Pattern.quote(service) + "/" + Pattern.quote(logdir)
                    + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)");
        }

        // Do a depth first search of the directory, processing anything that
        // looks interesting along the way
        Deque<Path> paths = new ArrayDeque<Path>();
        Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/");
        paths.push(rootPath);

        while (paths.size() > 0) {
            Path p = paths.pop();
            LOG.debug("{}", p.toString());

            if (!fs.exists(p)) {
                continue;
            }

            FileStatus dirStatus = fs.getFileStatus(p);
            FileStatus[] children = fs.listStatus(p);
            boolean addChildren = true;

            boolean old = dirStatus.getModificationTime() < now - WAIT_TIME;
            LOG.debug(" Was last modified {}ms ago", now - dirStatus.getModificationTime());

            if (!old) {
                LOG.debug(" Skipping, since it's not old enough.");

            } else if ((!rootPath.equals(p)) && (children.length == 0
                    || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) {
                // old and no children? Delete!
                LOG.info(" Deleting empty directory {}", p.toString());
                fs.delete(p, true);

            } else {
                Matcher matcher = datePathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking date directory");

                    // If this is already done, then skip it. So only process if it
                    // doesn't exist.
                    if (fs.exists(new Path(p, READY_MARKER)) == false) {
                        // Check each subdirectory. If they all have ready markers, then I
                        // guess we're ready.
                        boolean ready = true;
                        for (FileStatus c : children) {
                            if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) {
                                ready = false;
                                break;
                            }
                        }

                        if (ready) {
                            fs.createNewFile(new Path(p, READY_MARKER));
                        }
                    }
                }

                matcher = hourPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking hour directory");

                    // If this is already done, then skip it. So only process if it
                    // doesn't exist.
                    if (fs.exists(new Path(p, READY_MARKER)) == false) {
                        // Check each subdirectory. If they all have ready markers, then I
                        // guess we're ready.
                        boolean ready = true;
                        for (FileStatus c : children) {
                            if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) {
                                ready = false;
                                break;
                            }
                        }

                        if (ready) {
                            fs.createNewFile(new Path(p, READY_MARKER));
                        }
                    }
                }

                // Check to see if we have to run a merge
                matcher = incomingPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.debug("Checking incoming directory");
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);
                    String matchComponent = matcher.group(3);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    } else if (doMerge) {

                        // old, looks right, and has children? Run it!
                        boolean hasMatchingChildren = false;
                        boolean subdirTooYoung = false;

                        for (FileStatus child : children) {
                            if (!hasMatchingChildren) {
                                FileStatus[] grandchildren = fs.listStatus(child.getPath());
                                for (FileStatus gc : grandchildren) {
                                    if (VALID_FILE.matcher(gc.getPath().getName()).matches()) {
                                        hasMatchingChildren = true;
                                        break;
                                    }
                                }
                            }
                            if (!subdirTooYoung) {
                                if (child.getModificationTime() >= now - WAIT_TIME) {
                                    subdirTooYoung = true;
                                    LOG.debug(" Subdir {} is too young.", child.getPath());
                                }
                            }
                        }

                        if (!hasMatchingChildren) {
                            LOG.debug(" No files match the expected pattern ({})", VALID_FILE.pattern());
                        }

                        if (hasMatchingChildren && !subdirTooYoung) {
                            LOG.info(" Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(),
                                    dcNumber, service, matchDate, matchHour, matchComponent });

                            Properties oozieJobProps = new Properties();
                            oozieJobProps.load(new FileInputStream(mergeJobPropertiesFile));

                            oozieJobProps.setProperty("rootDir", rootDir);
                            oozieJobProps.setProperty("dcNumber", dcNumber);
                            oozieJobProps.setProperty("service", service);
                            oozieJobProps.setProperty("date", matchDate);
                            oozieJobProps.setProperty("hour", matchHour);
                            oozieJobProps.setProperty("component", matchComponent);
                            oozieJobProps.setProperty("user.name", userName);
                            oozieJobProps.setProperty("logdir", logdir);

                            mergeOozieRunner.submit(oozieJobProps);

                            addChildren = false;
                        }
                    }
                }

                // Check to see if we need to run a filter and archive
                matcher = dataPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);
                    String matchComponent = matcher.group(3);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) {

                        Properties oozieJobProps = new Properties();
                        oozieJobProps.load(new FileInputStream(filterJobPropertiesFile));

                        oozieJobProps.setProperty("rootDir", rootDir);
                        oozieJobProps.setProperty("dcNumber", dcNumber);
                        oozieJobProps.setProperty("service", service);
                        oozieJobProps.setProperty("date", matchDate);
                        oozieJobProps.setProperty("hour", matchHour);
                        oozieJobProps.setProperty("component", matchComponent);
                        oozieJobProps.setProperty("user.name", userName);
                        oozieJobProps.setProperty("logdir", logdir);

                        // Check to see if we should just keep all or delete all here.
                        // The filter file should be here
                        String appPath = oozieJobProps.getProperty("oozie.wf.application.path");
                        appPath = appPath.replaceFirst("\\$\\{.*?\\}", "");
                        Path filterFile = new Path(appPath + "/" + service + ".yaml");
                        LOG.info("Filter file is {}", filterFile);
                        if (fs.exists(filterFile)) {
                            List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent,
                                    fs.open(filterFile));

                            if (filters == null) {
                                LOG.warn(" Got null when getting filters. Not processing. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                            } else if (filters.size() == 0) {
                                LOG.warn(" Got no filters. Not processing. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                            } else if (filters.size() == 1
                                    && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) {
                                LOG.info(" Keeping everything. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                // Move files from data to archive
                                String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir
                                        + "/" + matchDate + "/" + matchHour + "/" + matchComponent + "/archive/";

                                String[] moveArgs = { zkConnectString, dcNumber, service, matchDate, matchHour,
                                        matchComponent, "move " + p.toUri().getPath() + " " + destination };
                                ToolRunner.run(new Configuration(), new LockedFs(), moveArgs);
                            } else if (filters.size() == 1
                                    && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) {
                                LOG.info(" Dropping everything. {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                // delete it all!
                                String[] delArgs = { zkConnectString, dcNumber, service, matchDate, matchHour,
                                        matchComponent, "delete " + p.toUri().getPath() };
                                ToolRunner.run(new Configuration(), new LockedFs(), delArgs);
                            } else {
                                LOG.info(" Run Filter/Archive job {} :: {} {} {} {} {}",
                                        new Object[] { p.toString(), dcNumber, service, matchDate, matchHour,
                                                matchComponent });
                                filterOozieRunner.submit(oozieJobProps);
                            }
                        } else {
                            LOG.warn("Skipping filter job, since no filter file exists");
                        }

                        addChildren = false;
                    }
                }

                matcher = archivePathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    String matchDate = matcher.group(1);
                    String matchHour = matcher.group(2);

                    String timestamp = matchDate + matchHour;

                    if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) {
                        LOG.info("Deleting old directory: {}", p);
                        fs.delete(p, true);
                        addChildren = false;
                    }
                }

                matcher = workingPathPattern.matcher(p.toUri().getPath());
                if (matcher.matches()) {
                    LOG.info(" Matches working pattern");
                    if (resetOrphanedJobs) {
                        String matchDate = matcher.group(1);
                        String matchHour = matcher.group(2);
                        String matchComponent = matcher.group(3);
                        String matchOozieJobId = matcher.group(4);

                        // Check to see what's up with the oozie job. If it's still
                        // running, we don't want to touch it.
                        Status status = null;
                        try {
                            WorkflowJob jobInfo = oozieClient.getJobInfo(matchOozieJobId);
                            status = jobInfo.getStatus();
                        } catch (OozieClientException e) {
                            if (e.getMessage() != null && e.getMessage().contains("Job does not exist")) {
                                LOG.info("Oozie job not found. Proceeding as though job was failed.", e);
                                status = Status.FAILED;
                            } else {
                                LOG.error("Oozie client error. Not Proceeding.", e);
                            }
                        }

                        LOG.info(" Oozie job status is {}", status);
                        if (status != null && status != Status.RUNNING && status != Status.PREP
                                && status != Status.SUSPENDED) {
                            // Move everything from working/xxx/incoming/ to incoming/
                            PathInfo lockPathInfo = new PathInfo(rootDir + "/" + dcNumber + "/" + service + "/"
                                    + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent);
                            lu.acquireWriteLock(lu.getLockPath(lockPathInfo));

                            FileStatus[] fileStatuses = fs
                                    .listStatus(new Path(p.toUri().getPath() + "/incoming/"));
                            if (fileStatuses != null) {
                                for (FileStatus fileStatus : fileStatuses) {
                                    Path toPath = new Path(fileStatus.getPath().getParent().getParent()
                                            .getParent().getParent(),
                                            "incoming/" + fileStatus.getPath().getName());

                                    LOG.info(" Moving data from {} to {}", fileStatus.getPath(), toPath);
                                    LOG.info(" mkdir {}", toPath);
                                    fs.mkdirs(toPath);

                                    Path fromDir = new Path(p.toUri().getPath(),
                                            "incoming/" + fileStatus.getPath().getName());
                                    LOG.info(" moving from {}", fromDir);
                                    FileStatus[] files = fs.listStatus(fromDir);
                                    if (files == null || files.length == 0) {
                                        LOG.info(" Nothing to move from {}", fromDir);
                                    } else {
                                        for (FileStatus f : files) {
                                            LOG.info(" rename {} {}", f.getPath(),
                                                    new Path(toPath, f.getPath().getName()));
                                            fs.rename(f.getPath(), new Path(toPath, f.getPath().getName()));
                                        }
                                    }

                                    LOG.info(" rm {}", fileStatus.getPath().getParent().getParent());
                                    fs.delete(fileStatus.getPath().getParent().getParent(), true);
                                }

                                lu.releaseWriteLock(lu.getLockPath(lockPathInfo));
                            }
                        }
                    }

                    addChildren = false;
                }
            }

            // Add any children which are directories to the stack.
            if (addChildren) {
                for (int i = children.length - 1; i >= 0; i--) {
                    FileStatus child = children[i];
                    if (child.isDir()) {
                        paths.push(child.getPath());
                    }
                }
            }
        }

        // Since we may have deleted a bunch of directories, delete any unused
        // locks from ZooKeeper.
        {
            LOG.info("Checking for unused locks in ZooKeeper");
            String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir;
            if (date != null) {
                scanPath += "/" + date;
                if (hour != null) {
                    scanPath += "/" + hour;
                }
            }

            List<LockInfo> lockInfo = lu.scan(scanPath);

            for (LockInfo li : lockInfo) {
                // Check if the lock path still exists in HDFS. If it doesn't, then
                // delete it from ZooKeeper.
                String path = li.getPath();
                String hdfsPath = path.substring(LockUtil.ROOT.length());
                if (!fs.exists(new Path(hdfsPath))) {
                    ZooKeeper zk = lu.getZkClient();
                    while (!path.equals(LockUtil.ROOT)) {
                        try {
                            zk.delete(path, -1);
                        } catch (KeeperException.NotEmptyException e) {
                            // That's fine. Just stop trying then.
                            break;
                        } catch (Exception e) {
                            LOG.error("Caught exception trying to delete from ZooKeeper.", e);
                            break;
                        }
                        LOG.info("Deleted from ZooKeeper: {}", path);
                        path = path.substring(0, path.lastIndexOf('/'));
                    }
                }
            }
        }
        lu.close();

        // Now that we're done, wait for the Oozie Runner to stop, and print the
        // results.
        LOG.info("Waiting for Oozie jobs to complete.");
        mergeOozieRunner.shutdown();
        mergeOozieRunnerThread.join();
        LOG.info("Oozie Job Stats : Merge : Started={} Succeeded={} failed={} errors={}",
                new Object[] { mergeOozieRunner.getStarted(), mergeOozieRunner.getSucceeded(),
                        mergeOozieRunner.getFailed(), mergeOozieRunner.getErrors() });

        filterOozieRunner.shutdown();
        filterOozieRunnerThread.join();
        LOG.info("Oozie Job Stats : Filter : Started={} Succeeded={} failed={} errors={}",
                new Object[] { filterOozieRunner.getStarted(), filterOozieRunner.getSucceeded(),
                        filterOozieRunner.getFailed(), filterOozieRunner.getErrors() });

    } catch (Exception e) {
        LOG.error("Unexpected exception caught.", e);
        return 1;
    }

    return 0;
}