List of usage examples for org.apache.hadoop.fs.FileSystem.createNewFile
public boolean createNewFile(Path f) throws IOException
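Before the per-project examples, a minimal sketch of the call itself (the path below is hypothetical): createNewFile creates a zero-length file and returns true only if the file did not already exist, so callers generally check the boolean rather than assume success.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateNewFileExample {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());

        // Hypothetical marker path; adjust to your own layout.
        Path marker = new Path("/tmp/example/_MARKER");

        // Several of the examples below conservatively create the parent directory first.
        fs.mkdirs(marker.getParent());

        // true only if the zero-length file was newly created; false if it already existed.
        if (fs.createNewFile(marker)) {
            System.out.println("created " + marker);
        } else {
            System.out.println(marker + " already exists");
        }
    }
}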
From source file: com.iflytek.spider.crawl.GeneratorSmart.java
License: Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
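The "generatored" file dropped next to each partitioned segment above is a completion marker. A hedged sketch, under the assumption that a downstream step wants only the segments carrying that marker (the helper class and method names are illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class SegmentMarkers {
    private SegmentMarkers() {
    }

    /** Returns only the segment directories that carry the "generatored" marker file. */
    public static List<Path> readySegments(FileSystem fs, Path segmentsDir) throws IOException {
        List<Path> ready = new ArrayList<Path>();
        for (FileStatus stat : fs.listStatus(segmentsDir)) {
            if (stat.isDirectory() && fs.exists(new Path(stat.getPath(), "generatored"))) {
                ready.add(stat.getPath());
            }
        }
        return ready;
    }
}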
From source file: com.iflytek.spider.util.LockUtil.java
License: Apache License
/**
 * Create a lock file.
 * @param fs filesystem
 * @param lockFile name of the lock file
 * @param accept if true, and the target file exists, consider it valid. If false
 *        and the target file exists, throw an IOException.
 * @throws IOException if accept is false, and the target file already exists,
 *         or if it's a directory.
 */
public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException {
    if (fs.exists(lockFile)) {
        if (!accept)
            throw new IOException("lock file " + lockFile + " already exists.");
        if (fs.getFileStatus(lockFile).isDir())
            throw new IOException("lock file " + lockFile + " already exists and is a directory.");
        // do nothing - the file already exists.
    } else {
        // make sure parents exist
        fs.mkdirs(lockFile.getParent());
        fs.createNewFile(lockFile);
    }
}
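A hedged sketch of how the helper above might be called; createLockFile and removeLockFile are the LockUtil methods used in the Generator example, while the paths and the guarded work are illustrative.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.iflytek.spider.util.LockUtil;

public class LockedUpdate {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        Path dbDir = new Path("/crawl/crawldb");  // hypothetical crawl database directory
        Path lock = new Path(dbDir, ".locked");   // lock file name is illustrative

        LockUtil.createLockFile(fs, lock, false); // throws if another run already holds the lock
        try {
            // ... work that must not run concurrently goes here ...
        } finally {
            LockUtil.removeLockFile(fs, lock);
        }
    }
}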
From source file: com.redsqirl.workflow.server.connect.HDFSInterface.java
License: Open Source License
/**
 * Create a path on HDFS with properties.
 *
 * @param path
 * @param properties
 * @throws RemoteException
 */
@Override
public String create(String path, Map<String, String> properties) throws RemoteException {
    String error = null;
    HdfsFileChecker fCh = new HdfsFileChecker(path);
    if (fCh.isInitialized() && !fCh.exists()) {
        if (properties.get(key_type) == null || properties.get(key_type).equalsIgnoreCase("directory")
                || properties.get(key_type).equalsIgnoreCase("file")) {
            try {
                FileSystem fs = NameNodeVar.getFS();
                boolean ok;
                if (properties.get(key_type) == null
                        || properties.get(key_type).equalsIgnoreCase("directory")) {
                    ok = fs.mkdirs(new Path(path));
                } else {
                    ok = fs.createNewFile(new Path(path));
                }
                // fs.close();
                if (ok) {
                    changeProperties(path, properties);
                } else {
                    error = LanguageManagerWF.getText("HdfsInterface.createdirfail", new Object[] { path });
                }
            } catch (IOException e) {
                error = LanguageManagerWF.getText("HdfsInterface.cannotcreate", new Object[] { path });
                logger.error(error);
                logger.error(e.getMessage());
            }
        } else {
            error = LanguageManagerWF.getText("HdfsInterface.typenotexists",
                    new Object[] { properties.get(key_type) });
        }
    } else {
        error = LanguageManagerWF.getText("HdfsInterface.pathexists", new Object[] { path });
    }
    // fCh.close();
    if (error != null) {
        logger.debug(error);
    }
    return error;
}
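Stripped of the checker and message-lookup plumbing, the branch that decides between a directory and a file reduces to a few lines. A hedged sketch of just that decision (the "type" key stands in for the example's key_type constant):

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class HdfsCreateHelper {
    private HdfsCreateHelper() {
    }

    /** Creates a directory for a missing or "directory" type, and an empty file for "file". */
    static boolean create(FileSystem fs, String path, Map<String, String> properties) throws IOException {
        String type = properties.get("type"); // stand-in for the example's key_type constant
        if (type == null || type.equalsIgnoreCase("directory")) {
            return fs.mkdirs(new Path(path));
        } else if (type.equalsIgnoreCase("file")) {
            return fs.createNewFile(new Path(path));
        }
        throw new IOException("Unsupported type: " + type);
    }
}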
From source file: com.rim.logdriver.admin.LogMaintenance.java
License: Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); // If run by Oozie, then load the Oozie conf too if (System.getProperty("oozie.action.conf.xml") != null) { conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml"))); }//from www . j a v a2 s.c om // For some reason, Oozie needs some options to be set in system instead of // in the confiuration. So copy the configs over. { Iterator<Entry<String, String>> i = conf.iterator(); while (i.hasNext()) { Entry<String, String> next = i.next(); System.setProperty(next.getKey(), next.getValue()); } } if (args.length < 3) { printUsage(); return 1; } String userName = args[0]; String dcNumber = args[1]; String service = args[2]; String date = null; String hour = null; if (args.length >= 4) { date = args[3]; } if (args.length >= 5) { hour = args[4]; } // Set from environment variables oozieUrl = getConfOrEnv(conf, "OOZIE_URL"); String mergeJobPropertiesFile = getConfOrEnv(conf, "MERGEJOB_CONF"); String filterJobPropertiesFile = getConfOrEnv(conf, "FILTERJOB_CONF"); String daysBeforeArchive = getConfOrEnv(conf, "DAYS_BEFORE_ARCHIVE"); String daysBeforeDelete = getConfOrEnv(conf, "DAYS_BEFORE_DELETE"); String maxConcurrentMergeJobs = getConfOrEnv(conf, "MAX_CONCURRENT_MERGE_JOBS"); String maxConcurrentFilterJobs = getConfOrEnv(conf, "MAX_CONCURRENT_FILTER_JOBS"); String zkConnectString = getConfOrEnv(conf, "ZK_CONNECT_STRING"); String logdir = getConfOrEnv(conf, "logdriver.logdir.name"); boolean resetOrphanedJobs = Boolean.parseBoolean(getConfOrEnv(conf, "reset.orphaned.jobs")); String rootDir = getConfOrEnv(conf, "service.root.dir"); boolean doMerge = true; boolean doArchive = true; boolean doDelete = true; if (oozieUrl == null) { LOG.info("OOZIE_URL is not set. Not merging or archiving."); doMerge = false; doArchive = false; } if (zkConnectString == null) { LOG.error("ZK_CONNECT_STRING is not set. Exiting."); return 1; } if (mergeJobPropertiesFile == null) { LOG.info("MERGEJOB_CONF is not set. Not merging."); doMerge = false; } if (filterJobPropertiesFile == null) { LOG.info("FILTERJOB_CONF is not set. Not archiving."); doArchive = false; } if (daysBeforeArchive == null) { LOG.info("DAYS_BEFORE_ARCHIVE is not set. Not archiving."); doArchive = false; } if (doArchive && Integer.parseInt(daysBeforeArchive) < 0) { LOG.info("DAYS_BEFORE_ARCHIVE is negative. Not archiving."); doArchive = false; } if (daysBeforeDelete == null) { LOG.info("DAYS_BEFORE_DELETE is not set. Not deleting."); doDelete = false; } if (doDelete && Integer.parseInt(daysBeforeDelete) < 0) { LOG.info("DAYS_BEFORE_DELETE is negative. Not deleting."); doDelete = false; } if (maxConcurrentMergeJobs == null) { LOG.info("MAX_CONCURRENT_MERGE_JOBS is not set. Using default value of -1."); maxConcurrentMergeJobs = "-1"; } if (maxConcurrentFilterJobs == null) { LOG.info("MAX_CONCURRENT_FILTER_JOBS is not set. Using default value of -1."); maxConcurrentMergeJobs = "-1"; } if (logdir == null) { LOG.info("LOGDRIVER_LOGDIR_NAME is not set. Using default value of 'logs'."); logdir = "logs"; } if (rootDir == null) { LOG.info("SERVICE_ROOT_DIR is not set. Using default value of 'service'."); rootDir = "/service"; } // Now it's safe to create our Oozie Runners. 
OozieRunner mergeOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentMergeJobs)); Thread mergeOozieRunnerThread = new Thread(mergeOozieRunner); mergeOozieRunnerThread.setName("OozieRunner - Merge"); mergeOozieRunnerThread.setDaemon(false); mergeOozieRunnerThread.start(); OozieRunner filterOozieRunner = new OozieRunner(oozieUrl, Integer.parseInt(maxConcurrentFilterJobs)); Thread filterOozieRunnerThread = new Thread(filterOozieRunner); filterOozieRunnerThread.setName("OozieRunner - Filter"); filterOozieRunnerThread.setDaemon(false); filterOozieRunnerThread.start(); // Figure out what date we start filters on. String filterCutoffDate = ""; if (doArchive) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeArchive)); filterCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Archiving logs from before {}", filterCutoffDate); } String deleteCutoffDate = ""; if (doDelete) { Calendar cal = Calendar.getInstance(); cal.add(Calendar.DAY_OF_MONTH, Integer.parseInt("-" + daysBeforeDelete)); deleteCutoffDate = String.format("%04d%02d%02d%02d", cal.get(Calendar.YEAR), (cal.get(Calendar.MONTH) + 1), cal.get(Calendar.DAY_OF_MONTH), cal.get(Calendar.HOUR_OF_DAY)); LOG.info("Deleting logs from before {}", deleteCutoffDate); } long now = System.currentTimeMillis(); // Various exceptions have been popping up here. So make sure I catch them // all. try { // We can hang if this fails. So make sure we abort if it fails. FileSystem fs = null; try { fs = FileSystem.get(conf); fs.exists(new Path("/")); // Test if it works. } catch (IOException e) { LOG.error("Error getting filesystem.", e); return 1; } // We'll need an Oozie client to check on orphaned directories. oozieClient = getOozieClient(); // LockUtils are used in a couple of places LockUtil lu = new LockUtil(zkConnectString); // Patterns to recognize hour, day and incoming directories, so that they // can be processed. 
Pattern datePathPattern; Pattern hourPathPattern; Pattern incomingPathPattern; Pattern dataPathPattern; Pattern archivePathPattern; Pattern workingPathPattern; if (hour != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(" + Pattern.quote(hour) + ")/([^/]+)/working/([^/]+)_(\\d+)"); } else if (date != null) { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")"); hourPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})"); incomingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(" + Pattern.quote(date) + ")/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } else { datePathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})"); hourPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})"); incomingPathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/incoming"); dataPathPattern = Pattern.compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/data"); archivePathPattern = Pattern .compile(rootDir + "/" + Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/archive"); workingPathPattern = Pattern .compile(rootDir + "/" + 
Pattern.quote(dcNumber) + "/" + Pattern.quote(service) + "/" + Pattern.quote(logdir) + "/(\\d{8})/(\\d{2})/([^/]+)/working/([^/]+)_(\\d+)"); } // Do a depth first search of the directory, processing anything that // looks // interesting along the way Deque<Path> paths = new ArrayDeque<Path>(); Path rootPath = new Path(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/"); paths.push(rootPath); while (paths.size() > 0) { Path p = paths.pop(); LOG.debug("{}", p.toString()); if (!fs.exists(p)) { continue; } FileStatus dirStatus = fs.getFileStatus(p); FileStatus[] children = fs.listStatus(p); boolean addChildren = true; boolean old = dirStatus.getModificationTime() < now - WAIT_TIME; LOG.debug(" Was last modified {}ms ago", now - dirStatus.getModificationTime()); if (!old) { LOG.debug(" Skipping, since it's not old enough."); } else if ((!rootPath.equals(p)) && (children.length == 0 || (children.length == 1 && children[0].getPath().getName().equals(READY_MARKER)))) { // old and no children? Delete! LOG.info(" Deleting empty directory {}", p.toString()); fs.delete(p, true); } else { Matcher matcher = datePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking date directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } matcher = hourPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking hour directory"); // If this is already done, then skip it. So only process if it // doesn't exist. if (fs.exists(new Path(p, READY_MARKER)) == false) { // Check each subdirectory. If they all have ready markers, then I // guess we're ready. boolean ready = true; for (FileStatus c : children) { if (c.isDir() && fs.exists(new Path(c.getPath(), READY_MARKER)) == false) { ready = false; break; } } if (ready) { fs.createNewFile(new Path(p, READY_MARKER)); } } } // Check to see if we have to run a merge matcher = incomingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.debug("Checking incoming directory"); String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doMerge) { // old, looks right, and has children? Run it! 
boolean hasMatchingChildren = false; boolean subdirTooYoung = false; for (FileStatus child : children) { if (!hasMatchingChildren) { FileStatus[] grandchildren = fs.listStatus(child.getPath()); for (FileStatus gc : grandchildren) { if (VALID_FILE.matcher(gc.getPath().getName()).matches()) { hasMatchingChildren = true; break; } } } if (!subdirTooYoung) { if (child.getModificationTime() >= now - WAIT_TIME) { subdirTooYoung = true; LOG.debug(" Subdir {} is too young.", child.getPath()); } } } if (!hasMatchingChildren) { LOG.debug(" No files match the expected pattern ({})", VALID_FILE.pattern()); } if (hasMatchingChildren && !subdirTooYoung) { LOG.info(" Run Merge job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); Properties oozieJobProps = new Properties(); oozieJobProps.load(new FileInputStream(mergeJobPropertiesFile)); oozieJobProps.setProperty("rootDir", rootDir); oozieJobProps.setProperty("dcNumber", dcNumber); oozieJobProps.setProperty("service", service); oozieJobProps.setProperty("date", matchDate); oozieJobProps.setProperty("hour", matchHour); oozieJobProps.setProperty("component", matchComponent); oozieJobProps.setProperty("user.name", userName); oozieJobProps.setProperty("logdir", logdir); mergeOozieRunner.submit(oozieJobProps); addChildren = false; } } } // Check to see if we need to run a filter and archive matcher = dataPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } else if (doArchive && timestamp.compareTo(filterCutoffDate) < 0) { Properties oozieJobProps = new Properties(); oozieJobProps.load(new FileInputStream(filterJobPropertiesFile)); oozieJobProps.setProperty("rootDir", rootDir); oozieJobProps.setProperty("dcNumber", dcNumber); oozieJobProps.setProperty("service", service); oozieJobProps.setProperty("date", matchDate); oozieJobProps.setProperty("hour", matchHour); oozieJobProps.setProperty("component", matchComponent); oozieJobProps.setProperty("user.name", userName); oozieJobProps.setProperty("logdir", logdir); // Check to see if we should just keep all or delete all here. // The filter file should be here String appPath = oozieJobProps.getProperty("oozie.wf.application.path"); appPath = appPath.replaceFirst("\\$\\{.*?\\}", ""); Path filterFile = new Path(appPath + "/" + service + ".yaml"); LOG.info("Filter file is {}", filterFile); if (fs.exists(filterFile)) { List<BoomFilterMapper.Filter> filters = BoomFilterMapper.loadFilters(matchComponent, fs.open(filterFile)); if (filters == null) { LOG.warn( " Got null when getting filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 0) { LOG.warn(" Got no filters. Not processing. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.KeepAllFilter) { LOG.info(" Keeping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // Move files from data to archive // delete it all! 
String destination = rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent + "/archive/"; String[] moveArgs = { zkConnectString, dcNumber, service, matchDate, matchHour, matchComponent, "move " + p.toUri().getPath() + " " + destination }; ToolRunner.run(new Configuration(), new LockedFs(), moveArgs); } else if (filters.size() == 1 && filters.get(0) instanceof BoomFilterMapper.DropAllFilter) { LOG.info(" Dropping everything. {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); // delete it all! String[] delArgs = { zkConnectString, dcNumber, service, matchDate, matchHour, matchComponent, "delete " + p.toUri().getPath() }; ToolRunner.run(new Configuration(), new LockedFs(), delArgs); } else { LOG.info(" Run Filter/Archive job {} :: {} {} {} {} {}", new Object[] { p.toString(), dcNumber, service, matchDate, matchHour, matchComponent }); filterOozieRunner.submit(oozieJobProps); } } else { LOG.warn("Skipping filter job, since no filter file exists"); } addChildren = false; } } matcher = archivePathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String timestamp = matchDate + matchHour; if (doDelete && timestamp.compareTo(deleteCutoffDate) < 0) { LOG.info("Deleting old directory: {}", p); fs.delete(p, true); addChildren = false; } } matcher = workingPathPattern.matcher(p.toUri().getPath()); if (matcher.matches()) { LOG.info(" Matches working pattern"); if (resetOrphanedJobs) { String matchDate = matcher.group(1); String matchHour = matcher.group(2); String matchComponent = matcher.group(3); String matchOozieJobId = matcher.group(4); // Check to see what's up with the oozie job. If it's still // running, // we don't want to touch it. Status status = null; try { WorkflowJob jobInfo = oozieClient.getJobInfo(matchOozieJobId); status = jobInfo.getStatus(); } catch (OozieClientException e) { if (e.getMessage() != null && e.getMessage().contains("Job does not exist")) { LOG.info("Oozie job not found. Proceeding as though job was failed.", e); status = Status.FAILED; } else { LOG.error("Oozie client error. 
Not Proceeding.", e); } } LOG.info(" Oozie job status is {}", status); if (status != null && status != Status.RUNNING && status != Status.PREP && status != Status.SUSPENDED) { // Move everything from working/xxx/incoming/ to incoming/ PathInfo lockPathInfo = new PathInfo(rootDir + "/" + dcNumber + "/" + service + "/" + logdir + "/" + matchDate + "/" + matchHour + "/" + matchComponent); lu.acquireWriteLock(lu.getLockPath(lockPathInfo)); FileStatus[] fileStatuses = fs .listStatus(new Path(p.toUri().getPath() + "/incoming/")); if (fileStatuses != null) { for (FileStatus fileStatus : fileStatuses) { Path toPath = new Path(fileStatus.getPath().getParent().getParent() .getParent().getParent(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" Moving data from {} to {}", fileStatus.getPath(), toPath); LOG.info(" mkdir {}", toPath); fs.mkdirs(toPath); Path fromDir = new Path(p.toUri().getPath(), "incoming/" + fileStatus.getPath().getName()); LOG.info(" moving from {}", fromDir); FileStatus[] files = fs.listStatus(fromDir); if (files == null || files.length == 0) { LOG.info(" Nothing to move from {}", fromDir); } else { for (FileStatus f : files) { LOG.info(" rename {} {}", f.getPath(), new Path(toPath, f.getPath().getName())); fs.rename(f.getPath(), new Path(toPath, f.getPath().getName())); } } LOG.info(" rm {}", fileStatus.getPath().getParent().getParent()); fs.delete(fileStatus.getPath().getParent().getParent(), true); } lu.releaseWriteLock(lu.getLockPath(lockPathInfo)); } } } addChildren = false; } } // Add any children which are directories to the stack. if (addChildren) { for (int i = children.length - 1; i >= 0; i--) { FileStatus child = children[i]; if (child.isDir()) { paths.push(child.getPath()); } } } } // Since we may have deleted a bunch of directories, delete any unused // locks // from ZooKeeper. { LOG.info("Checking for unused locks in ZooKeeper"); String scanPath = rootDir + "/" + dcNumber + "/" + service + "/" + logdir; if (date != null) { scanPath += "/" + date; if (hour != null) { scanPath += "/" + hour; } } List<LockInfo> lockInfo = lu.scan(scanPath); for (LockInfo li : lockInfo) { // Check if the lock path still exists in HDFS. If it doesn't, then // delete it from ZooKeeper. String path = li.getPath(); String hdfsPath = path.substring(LockUtil.ROOT.length()); if (!fs.exists(new Path(hdfsPath))) { ZooKeeper zk = lu.getZkClient(); while (!path.equals(LockUtil.ROOT)) { try { zk.delete(path, -1); } catch (KeeperException.NotEmptyException e) { // That's fine. just stop trying then. break; } catch (Exception e) { LOG.error("Caught exception trying to delete from ZooKeeper.", e); break; } LOG.info("Deleted from ZooKeeper: {}", path); path = path.substring(0, path.lastIndexOf('/')); } } } } lu.close(); // Now that we're done, wait for the Oozie Runner to stop, and print the // results. 
LOG.info("Waiting for Oozie jobs to complete."); mergeOozieRunner.shutdown(); mergeOozieRunnerThread.join(); LOG.info("Oozie Job Stats : Merge : Started={} Succeeded={} failed={} errors={}", new Object[] { mergeOozieRunner.getStarted(), mergeOozieRunner.getSucceeded(), mergeOozieRunner.getFailed(), mergeOozieRunner.getErrors() }); filterOozieRunner.shutdown(); filterOozieRunnerThread.join(); LOG.info("Oozie Job Stats : Filter : Started={} Succeeded={} failed={} errors={}", new Object[] { filterOozieRunner.getStarted(), filterOozieRunner.getSucceeded(), filterOozieRunner.getFailed(), filterOozieRunner.getErrors() }); } catch (Exception e) { LOG.error("Unexpected exception caught.", e); return 1; } return 0; }
From source file: com.sensei.indexing.hadoop.reduce.IndexUpdateOutputFormat.java
License: Apache License
public RecordWriter<Shard, Text> getRecordWriter(final FileSystem fs, JobConf job, String name,
        final Progressable progress) throws IOException {

    final Path perm = new Path(getWorkOutputPath(job), name);

    return new RecordWriter<Shard, Text>() {
        public void write(Shard key, Text value) throws IOException {
            assert (DONE.equals(value));

            String shardName = key.getDirectory();
            shardName = shardName.replace("/", "_");

            Path doneFile = new Path(perm, DONE + "_" + shardName);
            if (!fs.exists(doneFile)) {
                fs.createNewFile(doneFile);
            }
        }

        public void close(final Reporter reporter) throws IOException {
        }
    };
}
From source file: com.streamsets.pipeline.stage.destination.hdfs.metadataexecutor.HdfsMetadataExecutor.java
License: Apache License
@Override public void write(Batch batch) throws StageException { final ELVars variables = getContext().createELVars(); final FileSystem fs = hdfsConnection.getFs(); Iterator<Record> it = batch.getRecords(); while (it.hasNext()) { Record record = it.next();//from w w w . j a v a 2s .c o m RecordEL.setRecordInContext(variables, record); // Execute all configured HDFS metadata operations as target user try { hdfsConnection.getUGI().doAs((PrivilegedExceptionAction<Void>) () -> { Path workingFile = new Path(evaluate(variables, "filePath", actions.filePath)); LOG.info("Working on file: " + workingFile); // Create empty file if configured if (actions.taskType == TaskType.CREATE_EMPTY_FILE) { ensureDirectoryExists(fs, workingFile.getParent()); if (!fs.createNewFile(workingFile)) { throw new IOException("Can't create file (probably already exists): " + workingFile); } } if (actions.taskType == TaskType.CHANGE_EXISTING_FILE && (actions.shouldMoveFile || actions.shouldRename)) { Path newPath = workingFile.getParent(); String newName = workingFile.getName(); if (actions.shouldMoveFile) { newPath = new Path(evaluate(variables, "newLocation", actions.newLocation)); } if (actions.shouldRename) { newName = evaluate(variables, "newName", actions.newName); } Path destinationFile = new Path(newPath, newName); ensureDirectoryExists(fs, newPath); LOG.debug("Renaming to: {}", destinationFile); if (!fs.rename(workingFile, destinationFile)) { throw new IOException( Utils.format("Can't rename '{}' to '{}''", workingFile, destinationFile)); } workingFile = destinationFile; } if (actions.taskType.isOneOf(TaskType.CHANGE_EXISTING_FILE, TaskType.CREATE_EMPTY_FILE)) { if (actions.shouldChangeOwnership) { String newOwner = evaluate(variables, "newOwner", actions.newOwner); String newGroup = evaluate(variables, "newGroup", actions.newGroup); LOG.debug("Applying ownership: user={} and group={}", newOwner, newGroup); fs.setOwner(workingFile, newOwner, newGroup); } if (actions.shouldSetPermissions) { String stringPerms = evaluate(variables, "newPermissions", actions.newPermissions); FsPermission fsPerms = HdfsUtils.parseFsPermission(stringPerms); LOG.debug("Applying permissions: {} loaded from value '{}'", fsPerms, stringPerms); fs.setPermission(workingFile, fsPerms); } if (actions.shouldSetAcls) { String stringAcls = evaluate(variables, "newAcls", actions.newAcls); List<AclEntry> acls = AclEntry.parseAclSpec(stringAcls, true); LOG.debug("Applying ACLs: {}", stringAcls); fs.setAcl(workingFile, acls); } } if (actions.taskType == TaskType.REMOVE_FILE) { fs.delete(workingFile, true); } // Issue event with the final file name (e.g. the renamed one if applicable) actions.taskType.getEventCreator().create(getContext()).with("filepath", workingFile.toString()) .with("filename", workingFile.getName()).createAndSend(); LOG.debug("Done changing metadata on file: {}", workingFile); return null; }); } catch (Throwable e) { // Hadoop libraries will wrap any non InterruptedException, RuntimeException, Error or IOException to UndeclaredThrowableException, // so we manually unwrap it here and properly propagate it to user. if (e instanceof UndeclaredThrowableException) { e = e.getCause(); } LOG.error("Failure when applying metadata changes to HDFS", e); errorRecordHandler.onError( new OnRecordErrorException(record, HdfsMetadataErrors.HDFS_METADATA_000, e.getMessage())); } } }
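The CREATE_EMPTY_FILE branch above treats a false return from createNewFile as a hard error instead of silently continuing. A hedged sketch of that contract pulled into a helper (the helper and its name are illustrative; the example's ensureDirectoryExists is approximated here with mkdirs):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class StrictCreate {
    private StrictCreate() {
    }

    /** Creates an empty file and fails loudly if it already exists, mirroring the branch above. */
    static void createEmptyFileOrFail(FileSystem fs, Path file) throws IOException {
        fs.mkdirs(file.getParent()); // approximation of the example's ensureDirectoryExists()
        if (!fs.createNewFile(file)) {
            throw new IOException("Can't create file (probably already exists): " + file);
        }
    }
}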
From source file: com.twitter.hraven.etl.TestFileLister.java
License: Apache License
@Test
public void testPruneFileListBySize() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[0] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);

    Path emptyFile = new Path(
            inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 2);
}
From source file: com.twitter.hraven.etl.TestFileLister.java
License: Apache License
/**
 * removes conf file which has already been put in prunedList
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {
    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_pruneList");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_pruneList");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[0] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);
}
From source file: com.twitter.hraven.etl.TestFileLister.java
License: Apache License
/** * tests the case when several files are spread out in the dir and need to be removed * * @throws IOException/*w w w.j a va 2s . c om*/ */ @Test public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException { long maxFileSize = 20L; FileStatus[] origList = new FileStatus[12]; FileSystem hdfs = FileSystem.get(UTIL.getConfiguration()); Path inputPath = new Path("/inputdir_filesize_multiple"); boolean os = hdfs.mkdirs(inputPath); assertTrue(os); assertTrue(hdfs.exists(inputPath)); Path relocationPath = new Path("/relocation_filesize_multiple"); os = hdfs.mkdirs(relocationPath); assertTrue(os); assertTrue(hdfs.exists(relocationPath)); Path emptyFile = new Path( inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist"); os = hdfs.createNewFile(emptyFile); assertTrue(os); assertTrue(hdfs.exists(emptyFile)); origList[0] = hdfs.getFileStatus(emptyFile); Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml"); os = hdfs.createNewFile(emptyConfFile); assertTrue(os); assertTrue(hdfs.exists(emptyConfFile)); origList[1] = hdfs.getFileStatus(emptyConfFile); final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist"; File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME); Path srcPath = new Path(jobHistoryfile.toURI()); hdfs.copyFromLocalFile(srcPath, inputPath); Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName()); assertTrue(hdfs.exists(expPath)); origList[2] = hdfs.getFileStatus(expPath); final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml"; File jobConfFile = new File(JOB_CONF_FILE_NAME); srcPath = new Path(jobConfFile.toURI()); hdfs.copyFromLocalFile(srcPath, inputPath); expPath = new Path(inputPath.toUri() + "/" + srcPath.getName()); assertTrue(hdfs.exists(expPath)); origList[3] = hdfs.getFileStatus(expPath); Path inputPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist"); hdfs.copyFromLocalFile(srcPath, inputPath2); assertTrue(hdfs.exists(inputPath2)); origList[4] = hdfs.getFileStatus(inputPath2); Path inputPath3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist"); hdfs.copyFromLocalFile(srcPath, inputPath3); assertTrue(hdfs.exists(inputPath3)); origList[5] = hdfs.getFileStatus(inputPath3); Path inputPath4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist"); hdfs.copyFromLocalFile(srcPath, inputPath4); assertTrue(hdfs.exists(inputPath4)); origList[6] = hdfs.getFileStatus(inputPath4); Path emptyFile2 = new Path( inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist"); os = hdfs.createNewFile(emptyFile2); assertTrue(os); assertTrue(hdfs.exists(emptyFile2)); origList[7] = hdfs.getFileStatus(emptyFile2); Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml"); os = hdfs.createNewFile(emptyConfFile2); assertTrue(os); assertTrue(hdfs.exists(emptyConfFile2)); origList[8] = hdfs.getFileStatus(emptyConfFile2); // this is an empty file which tests the toBeRemovedFileList // at the end of function pruneFileListBySize Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml"); os = hdfs.createNewFile(emptyConfFile3); assertTrue(os); assertTrue(hdfs.exists(emptyConfFile3)); 
origList[9] = hdfs.getFileStatus(emptyConfFile3); Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml"); srcPath = new Path(jobConfFile.toURI()); hdfs.copyFromLocalFile(srcPath, inputConfPath2); assertTrue(hdfs.exists(inputConfPath2)); origList[10] = hdfs.getFileStatus(inputConfPath2); // this is an empty file which tests the toBeRemovedFileList // at the end of function pruneFileListBySize Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml"); os = hdfs.createNewFile(emptyConfFile4); assertTrue(os); assertTrue(hdfs.exists(emptyConfFile4)); origList[11] = hdfs.getFileStatus(emptyConfFile4); FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath); assertNotNull(prunedList); assertTrue(prunedList.length == 4); }
From source file: com.twitter.pycascading.MetaScheme.java
License: Apache License
public void sink(FlowProcess flowProcess, SinkCall sinkCall) throws IOException {
    if (firstLine) {
        Path path = new Path(outputPath + "/" + headerFileName);
        FileSystem fs = path.getFileSystem(((HadoopFlowProcess) flowProcess).getJobConf());
        FSDataOutputStream fsdos = null;
        try {
            if (fs.createNewFile(path)) {
                fsdos = fs.create(path, true);
                boolean firstField = true;
                for (Comparable<?> field : sinkCall.getOutgoingEntry().getFields()) {
                    if (firstField)
                        firstField = false;
                    else
                        fsdos.writeBytes("\t");
                    fsdos.writeBytes(field.toString());
                }
                fsdos.writeBytes("\t");
            }
        } catch (IOException ignored) {
        } finally {
            if (null != fsdos) {
                fsdos.close();
            }
        }

        // TODO: moar
        path = new Path(outputPath + "/" + schemeFileName);
        ObjectOutputStream oos = null;
        try {
            if (fs.createNewFile(path)) {
                fsdos = fs.create(path, true);
                oos = new ObjectOutputStream(fsdos);
                oos.writeObject(scheme);
                oos.writeObject(sinkCall.getOutgoingEntry().getFields());
            }
        } catch (IOException ignored) {
        } finally {
            if (null != fsdos) {
                fsdos.close();
            }
            if (null != oos) {
                oos.close();
            }
        }
    }
    firstLine = false;

    if (typeFileToWrite) {
        Path path = new Path(outputPath + "/" + typeFileName);
        FileSystem fs = path.getFileSystem(((HadoopFlowProcess) flowProcess).getJobConf());
        TupleEntry tupleEntry = null;
        FSDataOutputStream fsdos = null;
        try {
            if (fs.createNewFile(path)) {
                fsdos = fs.create(path, true);
                tupleEntry = sinkCall.getOutgoingEntry();
                for (int i = 0; i < tupleEntry.size(); i++) {
                    Comparable fieldName = null;
                    if (tupleEntry.getFields().size() < tupleEntry.size()) {
                        // We don't have names for the fields
                        fieldName = "";
                    } else {
                        fieldName = tupleEntry.getFields().get(i) + "\t";
                    }
                    Object object = tupleEntry.getObject(i);
                    Class<?> objectClass = (object == null ? Object.class : object.getClass());
                    fsdos.writeBytes(fieldName + objectClass.getName() + "\n");
                }
            }
        } catch (IOException e) {
        } finally {
            if (null != fsdos) {
                fsdos.close();
            }
        }
        typeFileToWrite = false;
    }

    scheme.sink(flowProcess, sinkCall);
}
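The header, scheme, and type files above all follow the same first-writer-wins pattern: createNewFile acts as a guard, so when many sink tasks race on the same side file, only the task that actually created it goes on to write the contents. A hedged, stripped-down sketch of that guard (path and payload are illustrative):

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class WriteOnceFile {
    private WriteOnceFile() {
    }

    /** Writes content only if this caller is the first to create the file. */
    static boolean writeIfFirst(FileSystem fs, Path path, String content) throws IOException {
        if (!fs.createNewFile(path)) {
            return false; // another task got there first; skip the write
        }
        try (FSDataOutputStream out = fs.create(path, true)) { // overwrite the empty marker with content
            out.writeBytes(content);
        }
        return true;
    }
}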