List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
From source file:com.redsqirl.workflow.server.datatype.PigTestUtils.java
License:Open Source License
public static void createHDFSFile(Path p, String containt) throws IOException { FileSystem fileSystem = NameNodeVar.getFS(); // Check if the file already exists if (fileSystem.exists(p)) { if (fileSystem.listStatus(p).length > 0) { logger.warn("File " + p.toString() + " already exists"); return; }//from w w w. java2 s .c o m } else { fileSystem.mkdirs(p); } // Create a new file and write data to it. FSDataOutputStream out = fileSystem.create(new Path(p, "part-0000")); out.write(containt.getBytes()); out.close(); fileSystem.close(); }
From source file:com.redsqirl.workflow.server.datatype.PigTestUtils.java
License:Open Source License
public static int getPathSize(Path p) throws IOException { FileSystem fs = NameNodeVar.getFS(); int psize = 0; if (fs.exists(p)) { psize = fs.listStatus(p).length; }/*from ww w. j a va2 s .com*/ return psize; }
From source file:com.redsqirl.workflow.server.oozie.SqirlNutcrackerAction.java
License:Open Source License
/** * Create an element for Idiro Engine Action in the Oozie action file * @param oozieXmlDoc//from ww w .j a va 2 s . c o m * @param action * @param fileNames * @throws RemoteException */ @Override public void createOozieElement(Document oozieXmlDoc, Element action, String[] fileNames) throws RemoteException { logger.debug("createOozieElement SqirlNutcrackerAction "); Element java = oozieXmlDoc.createElement("java"); defaultParam(oozieXmlDoc, java); String path = WorkflowPrefManager.getProperty(WorkflowPrefManager.sys_nutcracker_path); logger.debug("createOozieElement path " + path); try { FileSystem fs = NameNodeVar.getFS(); if (fs.isDirectory(new Path(path))) { FileStatus[] fileStatus = fs.listStatus(new Path(path)); Element property = oozieXmlDoc.createElement("property"); Element confName = oozieXmlDoc.createElement("name"); confName.appendChild(oozieXmlDoc.createTextNode("oozie.launcher.fs.hdfs.impl.disable.cache")); Element confValue = oozieXmlDoc.createElement("value"); confValue.appendChild(oozieXmlDoc.createTextNode("true")); property.appendChild(confName); property.appendChild(confValue); Element configuration = (Element) java.getElementsByTagName("configuration").item(0); configuration.appendChild(property); property = oozieXmlDoc.createElement("property"); confName = oozieXmlDoc.createElement("name"); confValue = oozieXmlDoc.createElement("value"); confName.appendChild(oozieXmlDoc.createTextNode("oozie.launcher.oozie.libpath")); confValue.appendChild(oozieXmlDoc.createTextNode(path)); property.appendChild(confName); property.appendChild(confValue); configuration.appendChild(property); Element mainClass = oozieXmlDoc.createElement("main-class"); mainClass.appendChild(oozieXmlDoc.createTextNode("com.sqirlnutcracker.SqirlNutcrackerMain")); java.appendChild(mainClass); Element javaopts = oozieXmlDoc.createElement("java-opts"); javaopts.appendChild(oozieXmlDoc.createTextNode("-Duser.name=" + System.getProperty("user.name"))); java.appendChild(javaopts); Element arg1 = oozieXmlDoc.createElement("arg"); arg1.appendChild(oozieXmlDoc.createTextNode("namenode=${" + OozieManager.prop_namenode + "},jobtracker=${" + OozieManager.prop_jobtracker + "}")); java.appendChild(arg1); Element arg2 = oozieXmlDoc.createElement("arg"); String[] filename = fileNames[0].split("/"); arg2.appendChild(oozieXmlDoc.createTextNode(filename[filename.length - 1])); java.appendChild(arg2); Element file = oozieXmlDoc.createElement("file"); file.appendChild(oozieXmlDoc.createTextNode(fileNames[0])); java.appendChild(file); action.appendChild(java); } else { logger.debug("createOozieElement isDirectory false "); } } catch (IOException e) { e.printStackTrace(); logger.debug("createOozieElement error " + e); } }
From source file:com.revolutionanalytics.hadoop.hdfs.FileUtils.java
License:Apache License
private static void ls__(FileSystem srcFS, String path, ArrayList<String> lsco, boolean dorecurse) throws IOException, FileNotFoundException { Path spath = new Path(path); FileStatus[] srcs;//from w w w .j ava 2 s . com srcs = srcFS.globStatus(spath); if (srcs == null || srcs.length == 0) { throw new FileNotFoundException("Cannot access " + path + ": No such file or directory."); } if (srcs.length == 1 && srcs[0].isDir()) srcs = srcFS.listStatus(srcs[0].getPath()); Calendar c = Calendar.getInstance(); for (FileStatus status : srcs) { StringBuilder sb = new StringBuilder(); boolean idir = status.isDir(); String x = idir ? "d" : "-"; if (dorecurse && idir) ls__(srcFS, status.getPath().toUri().getPath(), lsco, dorecurse); else { sb.append(x); sb.append(status.getPermission().toString()); sb.append(fsep); sb.append(status.getOwner()); sb.append(fsep); sb.append(status.getGroup()); sb.append(fsep); sb.append(status.getLen()); sb.append(fsep); Date d = new Date(status.getModificationTime()); sb.append(formatter.format(d)); sb.append(fsep); sb.append(status.getPath().toUri().getPath()); lsco.add(sb.toString()); } } }
From source file:com.ricemap.spateDB.operations.FileMBR.java
License:Apache License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing//from w w w .j a v a 2 s .c o m * @param conf * @param fs * @param file * @return * @throws IOException */ public static <S extends Shape> Prism fileMBRMapReduce(FileSystem fs, Path file, S stockShape, boolean background) throws IOException { // Quickly get file MBR if it is globally indexed GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(fs, file); if (globalIndex != null) { // Return the MBR of the global index. // Compute file size by adding up sizes of all files assuming they are // not compressed long totalLength = 0; for (Partition p : globalIndex) { Path filePath = new Path(file, p.filename); if (fs.exists(filePath)) totalLength += fs.getFileStatus(filePath).getLen(); } sizeOfLastProcessedFile = totalLength; return globalIndex.getMBR(); } JobConf job = new JobConf(FileMBR.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(file.toUri().getPath() + ".mbr_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("FileMBR"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Prism.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setInputFormat(ShapeInputFormat.class); SpatialSite.setShapeClass(job, stockShape.getClass()); job.setOutputFormat(TextOutputFormat.class); ShapeInputFormat.setInputPaths(job, file); TextOutputFormat.setOutputPath(job, outputPath); job.setOutputCommitter(MBROutputCommitter.class); // Submit the job if (background) { JobClient jc = new JobClient(job); lastSubmittedJob = jc.submitJob(job); return null; } else { lastSubmittedJob = JobClient.runJob(job); Counters counters = lastSubmittedJob.getCounters(); Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES); FileMBR.sizeOfLastProcessedFile = inputBytesCounter.getValue(); // Read job result FileStatus[] results = outFs.listStatus(outputPath); Prism mbr = new Prism(); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); Text text = new Text(); if (lineReader.readLine(text) > 0) { mbr.fromText(text); } lineReader.close(); } } outFs.delete(outputPath, true); return mbr; } }
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/** * Counts the exact number of lines in a file by issuing a MapReduce job * that does the thing//from w w w .ja v a 2s.c o m * @param conf * @param fs * @param file * @return * @throws IOException */ public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException { JobConf job = new JobConf(RecordCount.class); Path outputPath = new Path(file.toUri().getPath() + ".linecount"); FileSystem outFs = outputPath.getFileSystem(job); outFs.delete(outputPath, true); job.setJobName("LineCount"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(1); job.setInputFormat(ShapeLineInputFormat.class); job.setOutputFormat(TextOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, file); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job JobClient.runJob(job); // Read job result long lineCount = 0; FileStatus[] results = outFs.listStatus(outputPath); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); Text text = new Text(); if (lineReader.readLine(text) > 0) { lineCount = Long.parseLong(text.toString()); } lineReader.close(); } } outFs.delete(outputPath, true); return lineCount; }
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/** * Sample a ratio of the file through a MapReduce job * @param fs// w w w.j av a2s . co m * @param files * @param ratio * @param threshold - Maximum number of elements to be sampled * @param output * @param inObj * @return * @throws IOException */ public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio( FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output, T inObj, O outObj) throws IOException { JobConf job = new JobConf(FileMBR.class); Path outputPath; FileSystem outFs = FileSystem.get(job); do { outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000)); } while (outFs.exists(outputPath)); job.setJobName("Sample"); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setClass(InClass, inObj.getClass(), TextSerializable.class); job.setClass(OutClass, outObj.getClass(), TextSerializable.class); job.setMapperClass(Map.class); job.setLong(RANDOM_SEED, seed); job.setFloat(SAMPLE_RATIO, (float) ratio); ClusterStatus clusterStatus = new JobClient(job).getClusterStatus(); job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5); job.setNumReduceTasks(0); job.setInputFormat(ShapeLineInputFormat.class); job.setOutputFormat(TextOutputFormat.class); ShapeLineInputFormat.setInputPaths(job, files); TextOutputFormat.setOutputPath(job, outputPath); // Submit the job RunningJob run_job = JobClient.runJob(job); Counters counters = run_job.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS); final long resultCount = outputRecordCounter.getValue(); Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES); Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue(); // Ratio of records to return from output based on the threshold // Note that any number greater than or equal to one will cause all // elements to be returned final double selectRatio = (double) threshold / resultCount; // Read job result int result_size = 0; if (output != null) { Text line = new Text(); FileStatus[] results = outFs.listStatus(outputPath); for (FileStatus fileStatus : results) { if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) { LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath())); try { while (lineReader.readLine(line) > 0) { if (Math.random() < selectRatio) { if (output != null) { outObj.fromText(line); output.collect(outObj); } result_size++; } } } catch (RuntimeException e) { e.printStackTrace(); } lineReader.close(); } } } outFs.delete(outputPath, true); return result_size; }
From source file:com.rim.logdriver.admin.HFind.java
License:Apache License
@Override public int run(String[] args) throws Exception { final long startTime = System.currentTimeMillis(); int i = 0;//from w w w . ja v a 2 s . c o m while (i < args.length) { if (args[i].startsWith("-")) { break; } Path path = new Path(args[i]); FileSystem fs = path.getFileSystem(getConf()); FileStatus[] fileStatuses = fs.globStatus(path); if (fileStatuses != null) { for (FileStatus fileStatus : fileStatuses) { paths.add(fileStatus.getPath()); fileStatusCache.put(fileStatus.getPath(), fileStatus); } } i++; } while (i < args.length) { // -print action if ("-print".equals(args[i])) { actions.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { System.out.println(fileStatus.getPath()); return true; } }); } // -delete action if ("-delete".equals(args[i])) { actions.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { try { FileSystem fs = fileStatus.getPath().getFileSystem(getConf()); if (!fileStatus.isDir() || fs.listStatus(fileStatus.getPath()).length == 0) { return fs.delete(fileStatus.getPath(), true); } } catch (IOException e) { e.printStackTrace(); } return false; } }); } // -atime test else if ("-atime".equals(args[i])) { i++; if (i >= args.length) { System.err.println("Missing arguement for -atime"); System.exit(1); } String t = args[i]; if (t.charAt(0) == '+') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (24 * 60 * 60 * 1000) > time) { return true; } else { return false; } } }); } else if (t.charAt(0) == '-') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (24 * 60 * 60 * 1000) < time) { return true; } else { return false; } } }); } else { final long time = Long.parseLong(t); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (24 * 60 * 60 * 1000) == time) { return true; } else { return false; } } }); } } // -mtime test else if ("-mtime".equals(args[i])) { i++; if (i >= args.length) { System.err.println("Missing arguement for -mtime"); System.exit(1); } String t = args[i]; if (t.charAt(0) == '+') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (24 * 60 * 60 * 1000) > time) { return true; } else { return false; } } }); } else if (t.charAt(0) == '-') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (24 * 60 * 60 * 1000) < time) { return true; } else { return false; } } }); } else { final long time = Long.parseLong(t); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (24 * 60 * 60 * 1000) == time) { return true; } else { return false; } } }); } } // -amin test else if ("-amin".equals(args[i])) { i++; if (i >= args.length) { System.err.println("Missing arguement for -amin"); System.exit(1); } String t = args[i]; if (t.charAt(0) == '+') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (60 * 1000) > time) { return true; } else { return false; } } }); } else if (t.charAt(0) == '-') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (60 * 1000) < time) { return true; } else { return false; } } }); } else { final long time = Long.parseLong(t); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getAccessTime()) / (60 * 1000) == time) { return true; } else { return false; } } }); } } // -mmin test else if ("-mmin".equals(args[i])) { i++; if (i >= args.length) { System.err.println("Missing arguement for -mmin"); System.exit(1); } String t = args[i]; if (t.charAt(0) == '+') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (60 * 1000) > time) { return true; } else { return false; } } }); } else if (t.charAt(0) == '-') { final long time = Long.parseLong(t.substring(1)); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (60 * 1000) < time) { return true; } else { return false; } } }); } else { final long time = Long.parseLong(t); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if ((startTime - fileStatus.getModificationTime()) / (60 * 1000) == time) { return true; } else { return false; } } }); } } // -regex test else if ("-regex".equals(args[i])) { i++; if (i >= args.length) { System.err.println("Missing arguement for -regex"); System.exit(1); } final Pattern p = Pattern.compile(args[i]); tests.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { if (p.matcher(fileStatus.getPath().toString()).matches()) { return true; } else { return false; } } }); } i++; } if (actions.size() == 0) { actions.add(new FileStatusFilter() { @Override public boolean accept(FileStatus fileStatus) { System.out.println(fileStatus.getPath()); return true; } }); } search(); return 0; }
From source file:com.rim.logdriver.admin.HFind.java
License:Apache License
private void search() throws IOException { Set<Path> seen = new HashSet<Path>(); while (paths.size() > 0) { // Check if the top of the list has any children. If so, add them to the // stack. If not, then process it. Path p = paths.peekFirst(); FileSystem fs = p.getFileSystem(getConf()); FileStatus fileStatus = fileStatusCache.get(p); // Only check if we haven't seen this before. if (fileStatus.isDir() && seen.contains(p) == false) { FileStatus[] fileStatuses = fs.listStatus(p); if (fileStatuses != null && fileStatuses.length > 0) { for (FileStatus x : fileStatuses) { paths.addFirst(x.getPath()); fileStatusCache.put(x.getPath(), x); }//from www .jav a 2 s. c o m seen.add(p); continue; } } // If we get here, then we should be processing the path. p = paths.removeFirst(); // If we're processing it, we won't need it's status in the cache anymore. fileStatusCache.remove(p); boolean match = true; for (FileStatusFilter test : tests) { try { if (test.accept(fileStatus) == false) { match = false; break; } } catch (Throwable t) { t.printStackTrace(); System.err.println("path=" + p + " fileStatus=" + fileStatus); } } if (match == false) { continue; } for (FileStatusFilter action : actions) { try { if (action.accept(fileStatus) == false) { match = false; break; } } catch (Throwable t) { t.printStackTrace(); System.err.println("path=" + p + " fileStatus=" + fileStatus); } } } }
From source file:com.rim.logdriver.admin.LogMaintenance.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // If run by Oozie, then load the Oozie conf too if (System.getProperty("oozie.action.conf.xml") != null) { conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml"))); }/*from w ww .j a v a 2 s .c o m*/ // For some reason, Oozie needs some options to be set in system instead of // in the confiuration. So copy the configs over. { Iterator<Entry<String, String>> i = conf.iterator(); while (i.hasNext()) { Entry<String, String> next = i.next(); System.setProperty(next.getKey(), next.getValue()); } } if (args.length < 3) { printUsage(); return 1; } String userName = args[0]; String dcNumber = args[1]; String service = args[2]; String date = null; String hour = null; if (args.length >= 4) { date = args[3]; } if (args.length >= 5) { hour = args[4]; } // Set from environment variables oozieUrl = getConfOrEnv(conf, "OOZIE_URL"); String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF"); String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF"); String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE"); String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE"); String maxConcurrentMergeJobs = getConfOrEnv(conf, "MAX_CONCURRENT_MERGE_JOBS"); String maxConcurrentFilterJobs = getConfOrEnv(conf, "MAX_CONCURRENT_FILTER_JOBS"); String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING"); String logdir = getConfOrEnv(conf, "logdriver.logdir.name"); boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs")); String rootDir = getConfOrEnv(conf, "service.root.dir"); boolean doMerge = true; boolean doArchive = true; boolean doDelete = true; if (oozieUrl == null) { LOG.info("OOZIE_URL is not set. Not merging or archiving."); doMerge = false; doArchive = false; } if (zkConnectString == null) { LOG.error("ZK_CONNECT_STRING is not set. Exiting."); return 1; } if (mergeJobPropertiesFile == null) { LOG.info("MERGEJOB_CONF is not set. Not merging."); doMerge = false; } if (filterJobPropertiesFile == null) { LOG.info("FILTERJOB_CONF is not set. Not archiving."); doArchive = false; } if (daysBeforeArchive == null) { LOG.info("DAYS_BEFORE_ARCHIVE is not set. Not archiving."); doArchive = false; } if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) { LOG.info("DAYS_BEFORE_ARCHIVE is negative. Not archiving."); doArchive = false; } if (daysBeforeDelete == null) { LOG.info("DAYS_BEFORE_DELETE is not set. Not deleting."); doDelete = false; } if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) { LOG.info("DAYS_BEFORE_DELETE is negative. Not deleting."); doDelete = false; } if (maxConcurrentMergeJobs == null) { LOG.info("MAX_CONCURRENT_MERGE_JOBS is not set. Using default value of -1."); maxConcurrentMergeJobs = "-1"; } if (maxConcurrentFilterJobs == null) { LOG.info("MAX_CONCURRENT_FILTER_JOBS is not set. Using default value of -1."); maxConcurrentMergeJobs = "-1"; } if (logdir == null) { LOG.info("LOGDRIVER_LOGDIR_NAME is not set. Using default value of 'logs'."); logdir = "logs"; } if (rootDir == null) { LOG.info("SERVICE_ROOT_DIR is not set. Using default value of 'service'."); rootDir = "/service"; } // Now it's safe to create our Oozie Runners. OozieRunner mergeOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentMergeJobs)); Thread mergeOozieRunnerThread = new Thread(mergeOozieRunner); mergeOozieRunnerThread.setName("OozieRunner - Merge"); mergeOozieRunnerThread.setDaemon(false); mergeOozieRunnerThread.start(); OozieRunner filterOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentFilterJobs)); Thread filterOozieRunnerThread = new Thread(filterOozieRunner); filterOozieRunnerThread.setName("OozieRunner - Filter"); filterOozieRunnerThread.setDaemon(false); filterOozieRunnerThread.start(); // Figure out what date we start filters on. String filterCutoffDate = ""; if (doArchive) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive)); filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Archiving logs from before {}", filterCutoffDate); } String deleteCutoffDate = ""; if (doDelete) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete)); deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Deleting logs from before {}", deleteCutoffDate); } long now = System.currentTimeMillis(); // Various exceptions have been popping up here. So make sure I catch them // all. try { // We can hang if this fails. So make sure we abort if it fails. FileSystem fs = null; try { fs = FileSystem.get(conf); fs.exists(new Path("/")); // Test if it works. } catch (IOException e) { LOG.error("Error getting filesystem.", e); return 1; } // We'll need an Oozie client to check on orphaned directories. oozieClient = getOozieClient(); // LockUtils are used in a couple of places LockUtil lu = new LockUtil(zkConnectString); // Patterns to recognize hour, day and incoming directories, so that they // can be processed. Pattern datePathPattern; Pattern hourPathPattern; Pattern incomingPathPattern; Pattern dataPathPattern; Pattern archivePathPattern; Pattern workingPathPattern; if (hour != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)"); } else if (date != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } else { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})"); incomingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } // Do a depth first search of the directory, processing anything that // looks // interesting along the way Deque<Path> paths = new ArrayDeque<Path>(); Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/"); paths.push(rootPath); while (paths.size() > 0) { Path p = paths.pop(); LOG.debug("{}", p.toString()); if (!fs.exists(p)) { continue; } FileStatus dirStatus = fs.getFileStatus(p); FileStatus[] children = fs.listStatus(p); boolean addChildren = true; boolean old = dirStatus.getModificationTime() < now - WAIT_TIME; LOG.debug(" Was last modified {}ms ago", now - dirStatus.getModificationTime()); if (!old) { LOG.debug(" Skipping, since it's not old enough."); } else if ((!rootPath.equals(p)) && (children.length == 0 || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) { // old and no children? Delete! LOG.info(" Deleting empty directory {}", p.toString()); fs.delete(p, true); } else { Matcher matcher = datePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking date directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } matcher = hourPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking hour directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } // Check to see if we have to run a merge matcher = incomingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking incoming directory"); String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doMerge) { // old, looks right, and has children? Run it! boolean hasMatchingChildren = false; boolean subdirTooYoung = false; for (FileStatus child : children) { if (!hasMatchingChildren) { FileStatus[] grandchildren = fs.listStatus(child.getPath()); for (FileStatus gc : grandchildren) { if (VALID_FILE.matcher(gc.getPath().getName()).matches()) { hasMatchingChildren = true; break; } } } if (!subdirTooYoung) { if (child.getModificationTime() >= now - WAIT_TIME) { subdirTooYoung = true; LOG.debug(" Subdir {} is too young.", child.getPath()); } } } if (!hasMatchingChildren) { LOG.debug(" No files match the expected pattern ({})", VALID_FILE.pattern()); } if (hasMatchingChildren && !subdirTooYoung) { LOG.info(" Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); Properties oozieJobProps = new Properties(); oozieJobProps.load(new FileInputStream(mergeJobPropertiesFile)); oozieJobProps.setProperty("rootDir", rootDir); oozieJobProps.setProperty("dcNumber", dcNumber); oozieJobProps.setProperty("service", service); oozieJobProps.setProperty("date", matchDate); oozieJobProps.setProperty("hour", matchHour); oozieJobProps.setProperty("component", matchComponent); oozieJobProps.setProperty("user.name", userName); oozieJobProps.setProperty("logdir", logdir); mergeOozieRunner.submit(oozieJobProps); addChildren = false; } } } // Check to see if we need to run a filter and archive matcher = dataPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) { Properties oozieJobProps = new Properties(); oozieJobProps.load(new FileInputStream(filterJobPropertiesFile)); oozieJobProps.setProperty("rootDir", rootDir); oozieJobProps.setProperty("dcNumber", dcNumber); oozieJobProps.setProperty("service", service); oozieJobProps.setProperty("date", matchDate); oozieJobProps.setProperty("hour", matchHour); oozieJobProps.setProperty("component", matchComponent); oozieJobProps.setProperty("user.name", userName); oozieJobProps.setProperty("logdir", logdir); // Check to see if we should just keep all or delete all here. // The filter file should be here String appPath = oozieJobProps.getProperty("oozie.wf.application.path"); appPath = appPath.replaceFirst("\\$\\{.*?\\}", ""); Path filterFile = new Path(appPath + "/" + service + ".yaml"); LOG.info("Filter file is {}", filterFile); if (fs.exists(filterFile)) { List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent, fs.open(filterFile)); if (filters == null) { LOG.warn( " Got null when getting filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 0) { LOG.warn(" Got no filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) { LOG.info(" Keeping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // Move files from data to archive // delete it all! String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent + "/archive/"; String[] moveArgs = { zkConnectString, dcNumber, service, matchDate, matchHour, matchComponent, "move " + p.toUri().getPath() + " " + destination }; ToolRunner.run(new Configuration(), new LockedFs(), moveArgs); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) { LOG.info(" Dropping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // delete it all! String[] delArgs = { zkConnectString, dcNumber, service, matchDate, matchHour, matchComponent, "delete " + p.toUri().getPath() }; ToolRunner.run(new Configuration(), new LockedFs(), delArgs); } else { LOG.info(" Run Filter/Archive job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); filterOozieRunner.submit(oozieJobProps); } } else { LOG.warn("Skipping filter job, since no filter file exists"); } addChildren = false; } } matcher = archivePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } } matcher = workingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.info(" Matches working pattern"); if (resetOrphanedJobs) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String matchOozieJobId = matcher.group(4); // Check to see what's up with the oozie job. If it's still // running, // we don't want to touch it. Status status = null; try { WorkflowJob jobInfo = oozieClient.getJobInfo(matchOozieJobId); status = jobInfo.getStatus(); } catch (OozieClientException e) { if (e.getMessage() != null && e.getMessage().contains("Job does not exist")) { LOG.info("Oozie job not found. Proceeding as though job was failed.", e); status = Status.FAILED; } else { LOG.error("Oozie client error. Not Proceeding.", e); } } LOG.info(" Oozie job status is {}", status); if (status != null && status != Status.RUNNING && status != Status.PREP && status != Status.SUSPENDED) { // Move everything from working/xxx/incoming/ to incoming/ PathInfo lockPathInfo = new PathInfo(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent); lu.acquireWriteLock(lu.getLockPath(lockPathInfo)); FileStatus[] fileStatuses = fs .listStatus(new Path(p.toUri().getPath() + "/incoming/")); if (fileStatuses != null) { for (FileStatus fileStatus : fileStatuses) { Path toPath = new Path(fileStatus.getPath().getParent().getParent() .getParent().getParent(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" Moving data from {} to {}", fileStatus.getPath(), toPath); LOG.info(" mkdir {}", toPath); fs.mkdirs(toPath); Path fromDir = new Path(p.toUri().getPath(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" moving from {}", fromDir); FileStatus[] files = fs.listStatus(fromDir); if (files == null || files.length == 0) { LOG.info(" Nothing to move from {}", fromDir); } else { for (FileStatus f : files) { LOG.info(" rename {} {}", f.getPath(), new Path(toPath, f.getPath().getName())); fs.rename(f.getPath(), new Path(toPath, f.getPath().getName())); } } LOG.info(" rm {}", fileStatus.getPath().getParent().getParent()); fs.delete(fileStatus.getPath().getParent().getParent(), true); } lu.releaseWriteLock(lu.getLockPath(lockPathInfo)); } } } addChildren = false; } } // Add any children which are directories to the stack. if (addChildren) { for (int i = children.length - 1; i >= 0; i--) { FileStatus child = children[i]; if (child.isDir()) { paths.push(child.getPath()); } } } } // Since we may have deleted a bunch of directories, delete any unused // locks // from ZooKeeper. { LOG.info("Checking for unused locks in ZooKeeper"); String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir; if (date != null) { scanPath += "/" + date; if (hour != null) { scanPath += "/" + hour; } } List<LockInfo> lockInfo = lu.scan(scanPath); for (LockInfo li : lockInfo) { // Check if the lock path still exists in HDFS. If it doesn't, then // delete it from ZooKeeper. String path = li.getPath(); String hdfsPath = path.substring(LockUtil.ROOT.length()); if (!fs.exists(new Path(hdfsPath))) { ZooKeeper zk = lu.getZkClient(); while (!path.equals(LockUtil.ROOT)) { try { zk.delete(path, -1); } catch (KeeperException.NotEmptyException e) { // That's fine. just stop trying then. break; } catch (Exception e) { LOG.error("Caught exception trying to delete from ZooKeeper.", e); break; } LOG.info("Deleted from ZooKeeper: {}", path); path = path.substring(0, path.lastIndexOf('/')); } } } } lu.close(); // Now that we're done, wait for the Oozie Runner to stop, and print the // results. LOG.info("Waiting for Oozie jobs to complete."); mergeOozieRunner.shutdown(); mergeOozieRunnerThread.join(); LOG.info("Oozie Job Stats : Merge : Started={} Succeeded={} failed={} errors={}", new Object[] { mergeOozieRunner.getStarted(), mergeOozieRunner.getSucceeded(), mergeOozieRunner.getFailed(), mergeOozieRunner.getErrors() }); filterOozieRunner.shutdown(); filterOozieRunnerThread.join(); LOG.info("Oozie Job Stats : Filter : Started={} Succeeded={} failed={} errors={}", new Object[] { filterOozieRunner.getStarted(), filterOozieRunner.getSucceeded(), filterOozieRunner.getFailed(), filterOozieRunner.getErrors() }); } catch (Exception e) { LOG.error("Unexpected exception caught.", e); return 1; } return 0; }