List of usage examples for org.apache.hadoop.mapreduce.Job#getStatus()
public JobStatus getStatus() throws IOException, InterruptedException
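Before the source-file examples, a minimal sketch of the typical call pattern: submit a job and poll getStatus() for a JobStatus snapshot until the job reaches a terminal state. The job name, the input/output paths, and the pass-through configuration (default identity Mapper/Reducer) below are illustrative assumptions, not taken from any of the source files listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobStatusPollingExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // A pass-through job (identity Mapper/Reducer defaults) is enough to exercise getStatus().
    Job job = Job.getInstance(conf, "job-status-polling-example");
    job.setJarByClass(JobStatusPollingExample.class);
    FileInputFormat.addInputPath(job, new Path("/tmp/status-example/input"));    // assumed path
    FileOutputFormat.setOutputPath(job, new Path("/tmp/status-example/output")); // assumed path

    job.submit();

    // Poll the cluster for a fresh JobStatus snapshot until the job completes.
    while (!job.isComplete()) {
      JobStatus status = job.getStatus();
      System.out.printf("state=%s setup=%.2f map=%.2f reduce=%.2f%n",
          status.getState(), status.getSetupProgress(),
          status.getMapProgress(), status.getReduceProgress());
      Thread.sleep(5000);
    }

    // After completion, getStatus() reports the final state and, on failure, the reason.
    JobStatus finalStatus = job.getStatus();
    if (finalStatus.getState() != JobStatus.State.SUCCEEDED) {
      System.err.println("Job did not succeed: " + finalStatus.getFailureInfo());
    }
  }
}

Each call to getStatus() asks the cluster for an up-to-date snapshot, which is why the examples below call it repeatedly (for progress checks and for failure info) rather than caching a single JobStatus object.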
From source file:org.apache.ignite.internal.processors.hadoop.impl.client.HadoopClientProtocolSelfTest.java
License:Apache License
/**
 * Test job submission.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
  IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

  igfs.mkdirs(new IgfsPath(PATH_INPUT));

  try (BufferedWriter bw = new BufferedWriter(
      new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {
    bw.write("word");
  }

  Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

  final Job job = Job.getInstance(conf);

  try {
    job.setJobName(JOB_NAME);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(TestMapper.class);
    job.setReducerClass(TestReducer.class);

    if (!noCombiners)
      job.setCombinerClass(TestCombiner.class);

    if (noReducers)
      job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TestOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

    job.submit();

    JobID jobId = job.getJobID();

    // Setup phase.
    JobStatus jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
    assert jobStatus.getMapProgress() == 0.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    JobStatus recentJobStatus = job.getStatus();

    assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
        + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

    // Transferring to map phase.
    setupLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
      @Override public boolean apply() {
        try {
          return F.eq(1.0f, job.getStatus().getSetupProgress());
        }
        catch (Exception e) {
          throw new RuntimeException("Unexpected exception.", e);
        }
      }
    }, 5000L);

    // Map phase.
    jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    recentJobStatus = job.getStatus();

    assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old="
        + jobStatus.getMapProgress() + ", new=" + recentJobStatus.getMapProgress();

    // Transferring to reduce phase.
    mapLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
      @Override public boolean apply() {
        try {
          return F.eq(1.0f, job.getStatus().getMapProgress());
        }
        catch (Exception e) {
          throw new RuntimeException("Unexpected exception.", e);
        }
      }
    }, 5000L);

    if (!noReducers) {
      // Reduce phase.
      jobStatus = job.getStatus();
      checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
      assert jobStatus.getSetupProgress() == 1.0f;
      assert jobStatus.getMapProgress() == 1.0f;
      assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

      // Ensure that reduce progress increases.
      U.sleep(2100);

      recentJobStatus = job.getStatus();

      assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
          + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

      reduceLockFile.delete();
    }

    job.waitForCompletion(false);

    jobStatus = job.getStatus();
    checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() == 1.0f;
    assert jobStatus.getReduceProgress() == 1.0f;

    dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
  }
  finally {
    job.getCluster().close();
  }
}
From source file:org.apache.kylin.engine.mr.common.HadoopJobStatusChecker.java
License:Apache License
public static JobStepStatusEnum checkStatus(Job job, StringBuilder output) {
  if (job == null || job.getJobID() == null) {
    output.append("Skip status check with empty job id..\n");
    return JobStepStatusEnum.WAITING;
  }

  JobStepStatusEnum status = null;
  try {
    switch (job.getStatus().getState()) {
    case SUCCEEDED:
      status = JobStepStatusEnum.FINISHED;
      break;
    case FAILED:
      status = JobStepStatusEnum.ERROR;
      break;
    case KILLED:
      status = JobStepStatusEnum.KILLED;
      break;
    case RUNNING:
      status = JobStepStatusEnum.RUNNING;
      break;
    case PREP:
      status = JobStepStatusEnum.WAITING;
      break;
    default:
      throw new IllegalStateException();
    }
  } catch (Exception e) {
    logger.error("error check status", e);
    output.append("Exception: " + e.getLocalizedMessage() + "\n");
    status = JobStepStatusEnum.ERROR;
  }
  return status;
}
From source file:org.apache.nutch.crawl.DeduplicationJob.java
License:Apache License
public int run(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.println(
        "Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
    return 1;
  }

  String group = "none";
  Path crawlDb = new Path(args[0]);
  String compareOrder = "score,fetchTime,urlLength";

  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-group"))
      group = args[++i];
    if (args[i].equals("-compareOrder")) {
      compareOrder = args[++i];

      if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1
          || compareOrder.indexOf("urlLength") == -1) {
        System.err
            .println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
        return 1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("DeduplicationJob: starting at " + sdf.format(start));

  Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  Job job = NutchJob.getInstance(getConf());
  Configuration conf = job.getConfiguration();
  job.setJobName("Deduplication on " + crawlDb);
  conf.set(DEDUPLICATION_GROUP_MODE, group);
  conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
  job.setJarByClass(DeduplicationJob.class);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormatClass(SequenceFileInputFormat.class);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(CrawlDatum.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  job.setMapperClass(DBFilter.class);
  job.setReducerClass(DedupReducer.class);

  FileSystem fs = tempDir.getFileSystem(getConf());
  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = "Crawl job did not succeed, job status:" + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      fs.delete(tempDir, true);
      throw new RuntimeException(message);
    }
    CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
    if (g != null) {
      Counter counter = g.findCounter("Documents marked as duplicate");
      long dups = counter.getValue();
      LOG.info("Deduplication: " + (int) dups + " documents marked as duplicates");
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
    fs.delete(tempDir, true);
    return -1;
  }

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
  }

  Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(StatusUpdateReducer.class);
  mergeJob.setJarByClass(DeduplicationJob.class);

  fs = crawlDb.getFileSystem(getConf());
  Path outPath = FileOutputFormat.getOutputPath(job);
  Path lock = CrawlDb.lock(getConf(), crawlDb, false);

  try {
    boolean success = mergeJob.waitForCompletion(true);
    if (!success) {
      String message = "Crawl job did not succeed, job status:" + mergeJob.getStatus().getState()
          + ", reason: " + mergeJob.getStatus().getFailureInfo();
      LOG.error(message);
      fs.delete(tempDir, true);
      NutchJob.cleanupAfterFailure(outPath, lock, fs);
      throw new RuntimeException(message);
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
    fs.delete(tempDir, true);
    NutchJob.cleanupAfterFailure(outPath, lock, fs);
    return -1;
  }

  CrawlDb.install(mergeJob, crawlDb);

  // clean up
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));

  return 0;
}
From source file:org.apache.nutch.hostdb.ReadHostDb.java
License:Apache License
private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr)
    throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("ReadHostDb: starting at " + sdf.format(start));

  Configuration conf = getConf();
  conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages);
  conf.setBoolean(HOSTDB_DUMP_HOSTNAMES, dumpHostnames);
  if (expr != null) {
    conf.set(HOSTDB_FILTER_EXPRESSION, expr);
  }
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
  conf.set("mapreduce.output.textoutputformat.separator", "\t");

  Job job = Job.getInstance(conf);
  job.setJobName("ReadHostDb");
  job.setJarByClass(ReadHostDb.class);

  FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(ReadHostDbMapper.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(0);

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = "ReadHostDb job did not succeed, job status: " + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      // throw exception so that calling routine can exit with error
      throw new RuntimeException(message);
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error("ReadHostDb job failed: {}", e.getMessage());
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.hostdb.UpdateHostDb.java
License:Apache License
private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew,
    boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("UpdateHostDb: starting at " + sdf.format(start));

  Job job = NutchJob.getInstance(getConf());
  Configuration conf = job.getConfiguration();
  boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
  job.setJarByClass(UpdateHostDb.class);
  job.setJobName("UpdateHostDb");

  FileSystem fs = hostDb.getFileSystem(conf);
  Path old = new Path(hostDb, "old");
  Path current = new Path(hostDb, "current");
  Path tempHostDb = new Path(hostDb, "hostdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // lock an existing hostdb to prevent multiple simultaneous updates
  Path lock = new Path(hostDb, LOCK_NAME);
  if (!fs.exists(current)) {
    fs.mkdirs(current);
  }
  LockUtil.createLockFile(fs, lock, false);

  MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

  if (topHosts != null) {
    MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
  }
  if (crawlDb != null) {
    // Tell the job we read from CrawlDB
    conf.setBoolean("hostdb.reading.crawldb", true);
    MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME), SequenceFileInputFormat.class);
  }

  FileOutputFormat.setOutputPath(job, tempHostDb);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(HostDatum.class);
  job.setMapperClass(UpdateHostDbMapper.class);
  job.setReducerClass(UpdateHostDbReducer.class);
  job.setSpeculativeExecution(false);

  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
  conf.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
  conf.setBoolean(HOSTDB_CHECK_NEW, checkNew);
  conf.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
  conf.setBoolean(HOSTDB_FORCE_CHECK, force);
  conf.setBoolean(HOSTDB_URL_FILTERING, filter);
  conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
  conf.setClassLoader(Thread.currentThread().getContextClassLoader());

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = "UpdateHostDb job did not succeed, job status:" + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      NutchJob.cleanupAfterFailure(tempHostDb, lock, fs);
      throw new RuntimeException(message);
    }

    FSUtils.replace(fs, old, current, true);
    FSUtils.replace(fs, current, tempHostDb, true);

    if (!preserveBackup && fs.exists(old))
      fs.delete(old, true);
  } catch (Exception e) {
    LOG.error("UpdateHostDb job failed: {}", e.getMessage());
    NutchJob.cleanupAfterFailure(tempHostDb, lock, fs);
    throw e;
  }

  LockUtil.removeLockFile(fs, lock);

  long end = System.currentTimeMillis();
  LOG.info("UpdateHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.mapreduce.NutchUtil.java
License:Apache License
public static Map<String, Object> getJobState(Job job, String... groups) {
  Map<String, Object> jobState = Maps.newHashMap();
  if (job == null) {
    return jobState;
  }

  try {
    if (job.getStatus() == null || job.isRetired()) {
      return jobState;
    }
  } catch (IOException | InterruptedException e) {
    return jobState;
  }

  jobState.put("jobName", job.getJobName());
  jobState.put("jobID", job.getJobID());

  jobState.put(Nutch.STAT_COUNTERS, getJobCounters(job, groups));

  return jobState;
}
From source file:org.apache.nutch.tools.warc.WARCExporter.java
License:Apache License
public int generateWARC(String output, List<Path> segments) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("WARCExporter: starting at {}", sdf.format(start));

  final Job job = NutchJob.getInstance(getConf());
  job.setJobName("warc-exporter " + output);

  for (final Path segment : segments) {
    LOG.info("warc-exporter: adding segment: {}", segment);
    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);

  job.setJarByClass(WARCMapReduce.class);
  job.setMapperClass(WARCMapReduce.WARCMapper.class);
  job.setReducerClass(WARCMapReduce.WARCReducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);

  FileOutputFormat.setOutputPath(job, new Path(output));
  // using the old api
  job.setOutputFormatClass(WARCOutputFormat.class);

  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(WARCWritable.class);

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = "WARCExporter job did not succeed, job status:" + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      throw new RuntimeException(message);
    }
    LOG.info(job.getCounters().toString());
    long end = System.currentTimeMillis();
    LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
        TimingUtil.elapsedTime(start, end));
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error("WARCExporter job failed: {}", e.getMessage());
    return -1;
  }

  return 0;
}
From source file:org.apache.nutch.util.CrawlCompletionStats.java
License:Apache License
public int run(String[] args) throws Exception {
  Option helpOpt = new Option("h", "help", false, "Show this message");

  @SuppressWarnings("static-access")
  Option inDirs = OptionBuilder.withArgName("inputDirs").isRequired()
      .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
      .hasArgs().create("inputDirs");

  @SuppressWarnings("static-access")
  Option outDir = OptionBuilder.withArgName("outputDir").isRequired()
      .withDescription("Output directory where results should be dumped").hasArgs().create("outputDir");

  @SuppressWarnings("static-access")
  Option modeOpt = OptionBuilder.withArgName("mode").isRequired()
      .withDescription("Set statistics gathering mode (by 'host' or by 'domain')").hasArgs()
      .create("mode");

  @SuppressWarnings("static-access")
  Option numReducers = OptionBuilder.withArgName("numReducers")
      .withDescription("Optional number of reduce jobs to use. Defaults to 1").hasArgs()
      .create("numReducers");

  Options options = new Options();
  options.addOption(helpOpt);
  options.addOption(inDirs);
  options.addOption(outDir);
  options.addOption(modeOpt);
  options.addOption(numReducers);

  CommandLineParser parser = new GnuParser();
  CommandLine cli;

  try {
    cli = parser.parse(options, args);
  } catch (MissingOptionException e) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("CrawlCompletionStats", options, true);
    return 1;
  }

  if (cli.hasOption("help")) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("CrawlCompletionStats", options, true);
    return 1;
  }

  String inputDir = cli.getOptionValue("inputDirs");
  String outputDir = cli.getOptionValue("outputDir");

  int numOfReducers = 1;
  if (cli.hasOption("numReducers")) {
    numOfReducers = Integer.parseInt(cli.getOptionValue("numReducers"));
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start));

  int mode = 0;
  String jobName = "CrawlCompletionStats";
  if (cli.getOptionValue("mode").equals("host")) {
    jobName = "Host CrawlCompletionStats";
    mode = MODE_HOST;
  } else if (cli.getOptionValue("mode").equals("domain")) {
    jobName = "Domain CrawlCompletionStats";
    mode = MODE_DOMAIN;
  }

  Configuration conf = getConf();
  conf.setInt("domain.statistics.mode", mode);
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  Job job = Job.getInstance(conf, jobName);
  job.setJarByClass(CrawlCompletionStats.class);

  String[] inputDirsSpecs = inputDir.split(",");
  for (int i = 0; i < inputDirsSpecs.length; i++) {
    File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
    FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(CrawlCompletionStatsMapper.class);
  job.setReducerClass(CrawlCompletionStatsReducer.class);
  job.setCombinerClass(CrawlCompletionStatsCombiner.class);
  job.setNumReduceTasks(numOfReducers);

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = jobName + " job did not succeed, job status: " + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      // throw exception so that calling routine can exit with error
      throw new RuntimeException(message);
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error(jobName + " job failed");
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", sdf.format(end),
      TimingUtil.elapsedTime(start, end));

  return 0;
}
From source file:org.apache.nutch.util.ProtocolStatusStatistics.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]");
    System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
    System.err.println("\t\t\tE.g.: crawl/crawldb/");
    System.err.println("\toutDir\t\tOutput directory where results should be dumped");
    System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
    return 1;
  }

  String inputDir = args[0];
  String outputDir = args[1];
  int numOfReducers = 1;

  if (args.length > 2) {
    numOfReducers = Integer.parseInt(args[2]);
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("ProtocolStatistics: starting at " + sdf.format(start));

  String jobName = "ProtocolStatistics";

  Configuration conf = getConf();
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
  Job job = Job.getInstance(conf, jobName);
  job.setJarByClass(ProtocolStatusStatistics.class);

  String[] inputDirsSpecs = inputDir.split(",");
  for (int i = 0; i < inputDirsSpecs.length; i++) {
    File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
    FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(ProtocolStatusStatisticsMapper.class);
  job.setReducerClass(ProtocolStatusStatisticsReducer.class);
  job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
  job.setNumReduceTasks(numOfReducers);

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = jobName + " job did not succeed, job status: " + job.getStatus().getState()
          + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      // throw exception so that calling routine can exit with error
      throw new RuntimeException(message);
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error(jobName + " job failed", e);
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));

  return 0;
}
From source file:org.apache.nutch.util.SitemapProcessor.java
License:Apache License
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
    boolean normalize, int threads) throws Exception {
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
  }

  FileSystem fs = crawldb.getFileSystem(getConf());
  Path old = new Path(crawldb, "old");
  Path current = new Path(crawldb, "current");
  Path tempCrawlDb = new Path(crawldb, "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // lock an existing crawldb to prevent multiple simultaneous updates
  Path lock = new Path(crawldb, LOCK_NAME);
  if (!fs.exists(current))
    fs.mkdirs(current);

  LockUtil.createLockFile(fs, lock, false);

  Configuration conf = getConf();
  conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
  conf.setBoolean(SITEMAP_URL_FILTERING, filter);
  conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
  job.setJarByClass(SitemapProcessor.class);

  // add crawlDb, sitemap url directory and hostDb to input paths
  MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

  if (sitemapUrlDir != null)
    MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);

  if (hostdb != null)
    MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

  FileOutputFormat.setOutputPath(job, tempCrawlDb);

  job.setOutputFormatClass(MapFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  job.setMapperClass(MultithreadedMapper.class);
  MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
  MultithreadedMapper.setNumberOfThreads(job, threads);
  job.setReducerClass(SitemapReducer.class);

  try {
    boolean success = job.waitForCompletion(true);
    if (!success) {
      String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
          + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
      LOG.error(message);
      NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
      // throw exception so that calling routine can exit with error
      throw new RuntimeException(message);
    }

    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
    if (!preserveBackup && fs.exists(old))
      fs.delete(old, true);
    else
      FSUtils.replace(fs, old, current, true);

    FSUtils.replace(fs, current, tempCrawlDb, true);
    LockUtil.removeLockFile(fs, lock);

    if (LOG.isInfoEnabled()) {
      long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
      long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
      long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
      long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
      long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

      LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
      LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
      LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
      LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
      LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

      long end = System.currentTimeMillis();
      LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
          TimingUtil.elapsedTime(start, end));
    }
  } catch (IOException | InterruptedException | ClassNotFoundException e) {
    LOG.error("SitemapProcessor_" + crawldb.toString(), e);
    NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
    throw e;
  }
}