Example usage for org.apache.hadoop.mapreduce Job getStatus

List of usage examples for org.apache.hadoop.mapreduce Job getStatus

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.Job.getStatus().

Prototype

public JobStatus getStatus() throws IOException, InterruptedException 
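
A minimal polling sketch, for orientation before the full examples below. The class name, job configuration, and sleep interval are illustrative assumptions and are not taken from any of the listed projects; only getStatus(), isComplete(), and the JobStatus accessors shown are the standard org.apache.hadoop.mapreduce API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;

public class JobStatusPollingSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical setup: mapper/reducer classes and input/output paths still
        // need to be configured before submit() will succeed on a real cluster.
        Job job = Job.getInstance(new Configuration(), "status-demo");
        // job.setJarByClass(...), job.setMapperClass(...), FileInputFormat.setInputPaths(...), ...
        job.submit();

        // Each getStatus() call fetches a fresh JobStatus from the cluster and may
        // throw IOException or InterruptedException, as declared in the prototype above.
        while (!job.isComplete()) {
            JobStatus status = job.getStatus();
            System.out.printf("state=%s setup=%.2f map=%.2f reduce=%.2f%n",
                    status.getState(), status.getSetupProgress(),
                    status.getMapProgress(), status.getReduceProgress());
            Thread.sleep(5000L);
        }
        System.out.println("Final state: " + job.getStatus().getState());
    }
}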

Usage

From source file: org.apache.ignite.internal.processors.hadoop.impl.client.HadoopClientProtocolSelfTest.java

License: Apache License

/**
 * Test job submission.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {

        bw.write("word");
    }

    Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    try {
        job.setJobName(JOB_NAME);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(TestMapper.class);
        job.setReducerClass(TestReducer.class);

        if (!noCombiners)
            job.setCombinerClass(TestCombiner.class);

        if (noReducers)
            job.setNumReduceTasks(0);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TestOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
        FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

        job.submit();

        JobID jobId = job.getJobID();

        // Setup phase.
        JobStatus jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
        assert jobStatus.getMapProgress() == 0.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        JobStatus recentJobStatus = job.getStatus();

        assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
                + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

        // Transferring to map phase.
        setupLockFile.delete();

        assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getSetupProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        // Map phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old="
                + jobStatus.getMapProgress() + ", new=" + recentJobStatus.getMapProgress();

        // Transferring to reduce phase.
        mapLockFile.delete();

        assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getMapProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        if (!noReducers) {
            // Reduce phase.
            jobStatus = job.getStatus();
            checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
            assert jobStatus.getSetupProgress() == 1.0f;
            assert jobStatus.getMapProgress() == 1.0f;
            assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

            // Ensure that reduce progress increases.
            U.sleep(2100);

            recentJobStatus = job.getStatus();

            assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                    + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

            reduceLockFile.delete();
        }

        job.waitForCompletion(false);

        jobStatus = job.getStatus();
        checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() == 1.0f;

        dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
    } finally {
        job.getCluster().close();
    }
}

From source file: org.apache.kylin.engine.mr.common.HadoopJobStatusChecker.java

License: Apache License

public static JobStepStatusEnum checkStatus(Job job, StringBuilder output) {
    if (job == null || job.getJobID() == null) {
        output.append("Skip status check with empty job id..\n");
        return JobStepStatusEnum.WAITING;
    }

    JobStepStatusEnum status = null;
    try {
        switch (job.getStatus().getState()) {
        case SUCCEEDED:
            status = JobStepStatusEnum.FINISHED;
            break;
        case FAILED:
            status = JobStepStatusEnum.ERROR;
            break;
        case KILLED:
            status = JobStepStatusEnum.KILLED;
            break;
        case RUNNING:
            status = JobStepStatusEnum.RUNNING;
            break;
        case PREP:
            status = JobStepStatusEnum.WAITING;
            break;
        default:
            throw new IllegalStateException();
        }
    } catch (Exception e) {
        logger.error("error check status", e);
        output.append("Exception: " + e.getLocalizedMessage() + "\n");
        status = JobStepStatusEnum.ERROR;
    }

    return status;
}

From source file: org.apache.nutch.crawl.DeduplicationJob.java

License: Apache License

public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println(
                "Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
        return 1;
    }

    String group = "none";
    Path crawlDb = new Path(args[0]);
    String compareOrder = "score,fetchTime,urlLength";

    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-group"))
            group = args[++i];
        if (args[i].equals("-compareOrder")) {
            compareOrder = args[++i];

            if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1
                    || compareOrder.indexOf("urlLength") == -1) {
                System.err
                        .println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                return 1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeduplicationJob: starting at " + sdf.format(start));

    Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Deduplication on " + crawlDb);
    conf.set(DEDUPLICATION_GROUP_MODE, group);
    conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
    job.setJarByClass(DeduplicationJob.class);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    FileSystem fs = tempDir.getFileSystem(getConf());
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            throw new RuntimeException(message);
        }
        CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
        if (g != null) {
            Counter counter = g.findCounter("Documents marked as duplicate");
            long dups = counter.getValue();
            LOG.info("Deduplication: " + (int) dups + " documents marked as duplicates");
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
    }

    Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);
    mergeJob.setJarByClass(DeduplicationJob.class);

    fs = crawlDb.getFileSystem(getConf());
    Path outPath = FileOutputFormat.getOutputPath(job);
    Path lock = CrawlDb.lock(getConf(), crawlDb, false);
    try {
        boolean success = mergeJob.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + mergeJob.getStatus().getState()
                    + ", reason: " + mergeJob.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        NutchJob.cleanupAfterFailure(outPath, lock, fs);
        return -1;
    }

    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));

    return 0;
}

From source file: org.apache.nutch.hostdb.ReadHostDb.java

License: Apache License

private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr)
        throws Exception {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ReadHostDb: starting at " + sdf.format(start));

    Configuration conf = getConf();
    conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages);
    conf.setBoolean(HOSTDB_DUMP_HOSTNAMES, dumpHostnames);
    if (expr != null) {
        conf.set(HOSTDB_FILTER_EXPRESSION, expr);
    }
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    conf.set("mapreduce.output.textoutputformat.separator", "\t");

    Job job = Job.getInstance(conf);
    job.setJobName("ReadHostDb");
    job.setJarByClass(ReadHostDb.class);

    FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(ReadHostDbMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "ReadHostDb job did not succeed, job status: " + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("ReadHostDb job failed: {}", e.getMessage());
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.hostdb.UpdateHostDb.java

License: Apache License

private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew,
        boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("UpdateHostDb: starting at " + sdf.format(start));

    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
    job.setJarByClass(UpdateHostDb.class);
    job.setJobName("UpdateHostDb");

    FileSystem fs = hostDb.getFileSystem(conf);
    Path old = new Path(hostDb, "old");
    Path current = new Path(hostDb, "current");
    Path tempHostDb = new Path(hostDb, "hostdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing hostdb to prevent multiple simultaneous updates
    Path lock = new Path(hostDb, LOCK_NAME);
    if (!fs.exists(current)) {
        fs.mkdirs(current);
    }
    LockUtil.createLockFile(fs, lock, false);

    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

    if (topHosts != null) {
        MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
    }
    if (crawlDb != null) {
        // Tell the job we read from CrawlDB
        conf.setBoolean("hostdb.reading.crawldb", true);
        MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME),
                SequenceFileInputFormat.class);
    }

    FileOutputFormat.setOutputPath(job, tempHostDb);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HostDatum.class);
    job.setMapperClass(UpdateHostDbMapper.class);
    job.setReducerClass(UpdateHostDbReducer.class);
    job.setSpeculativeExecution(false);

    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    conf.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
    conf.setBoolean(HOSTDB_CHECK_NEW, checkNew);
    conf.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
    conf.setBoolean(HOSTDB_FORCE_CHECK, force);
    conf.setBoolean(HOSTDB_URL_FILTERING, filter);
    conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
    conf.setClassLoader(Thread.currentThread().getContextClassLoader());

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "UpdateHostDb job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempHostDb, lock, fs);
            throw new RuntimeException(message);
        }

        FSUtils.replace(fs, old, current, true);
        FSUtils.replace(fs, current, tempHostDb, true);

        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
    } catch (Exception e) {
        LOG.error("UpdateHostDb job failed: {}", e.getMessage());
        NutchJob.cleanupAfterFailure(tempHostDb, lock, fs);
        throw e;
    }

    LockUtil.removeLockFile(fs, lock);
    long end = System.currentTimeMillis();
    LOG.info("UpdateHostDb: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.mapreduce.NutchUtil.java

License: Apache License

public static Map<String, Object> getJobState(Job job, String... groups) {
    Map<String, Object> jobState = Maps.newHashMap();
    if (job == null) {
        return jobState;
    }

    try {
        if (job.getStatus() == null || job.isRetired()) {
            return jobState;
        }
    } catch (IOException | InterruptedException e) {
        return jobState;
    }

    jobState.put("jobName", job.getJobName());
    jobState.put("jobID", job.getJobID());

    jobState.put(Nutch.STAT_COUNTERS, getJobCounters(job, groups));

    return jobState;
}

From source file: org.apache.nutch.tools.warc.WARCExporter.java

License: Apache License

public int generateWARC(String output, List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("WARCExporter: starting at {}", sdf.format(start));

    final Job job = NutchJob.getInstance(getConf());
    job.setJobName("warc-exporter " + output);

    for (final Path segment : segments) {
        LOG.info("warc-exporter: adding segment: {}", segment);
        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setJarByClass(WARCMapReduce.class);
    job.setMapperClass(WARCMapReduce.WARCMapper.class);
    job.setReducerClass(WARCMapReduce.WARCReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(output));
    // using the old api
    job.setOutputFormatClass(WARCOutputFormat.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "WARCExporter job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            throw new RuntimeException(message);
        }
        LOG.info(job.getCounters().toString());
        long end = System.currentTimeMillis();
        LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
                TimingUtil.elapsedTime(start, end));
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("WARCExporter job failed: {}", e.getMessage());
        return -1;
    }

    return 0;
}

From source file: org.apache.nutch.util.CrawlCompletionStats.java

License: Apache License

public int run(String[] args) throws Exception {
    Option helpOpt = new Option("h", "help", false, "Show this message");
    @SuppressWarnings("static-access")
    Option inDirs = OptionBuilder.withArgName("inputDirs").isRequired()
            .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
            .hasArgs().create("inputDirs");
    @SuppressWarnings("static-access")
    Option outDir = OptionBuilder.withArgName("outputDir").isRequired()
            .withDescription("Output directory where results should be dumped").hasArgs().create("outputDir");
    @SuppressWarnings("static-access")
    Option modeOpt = OptionBuilder.withArgName("mode").isRequired()
            .withDescription("Set statistics gathering mode (by 'host' or by 'domain')").hasArgs()
            .create("mode");
    @SuppressWarnings("static-access")
    Option numReducers = OptionBuilder.withArgName("numReducers")
            .withDescription("Optional number of reduce jobs to use. Defaults to 1").hasArgs()
            .create("numReducers");

    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(inDirs);
    options.addOption(outDir);
    options.addOption(modeOpt);
    options.addOption(numReducers);

    CommandLineParser parser = new GnuParser();
    CommandLine cli;

    try {
        cli = parser.parse(options, args);
    } catch (MissingOptionException e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("CrawlCompletionStats", options, true);
        return 1;
    }

    if (cli.hasOption("help")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("CrawlCompletionStats", options, true);
        return 1;
    }

    String inputDir = cli.getOptionValue("inputDirs");
    String outputDir = cli.getOptionValue("outputDir");

    int numOfReducers = 1;
    if (cli.hasOption("numReducers")) {
        numOfReducers = Integer.parseInt(cli.getOptionValue("numReducers"));
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start));

    int mode = 0;
    String jobName = "CrawlCompletionStats";
    if (cli.getOptionValue("mode").equals("host")) {
        jobName = "Host CrawlCompletionStats";
        mode = MODE_HOST;
    } else if (cli.getOptionValue("mode").equals("domain")) {
        jobName = "Domain CrawlCompletionStats";
        mode = MODE_DOMAIN;
    }

    Configuration conf = getConf();
    conf.setInt("domain.statistics.mode", mode);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(CrawlCompletionStats.class);

    String[] inputDirsSpecs = inputDir.split(",");
    for (int i = 0; i < inputDirsSpecs.length; i++) {
        File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
        FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));

    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(CrawlCompletionStatsMapper.class);
    job.setReducerClass(CrawlCompletionStatsReducer.class);
    job.setCombinerClass(CrawlCompletionStatsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = jobName + " job did not succeed, job status: " + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error(jobName + " job failed");
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", sdf.format(end),
            TimingUtil.elapsedTime(start, end));
    return 0;
}

From source file: org.apache.nutch.util.ProtocolStatusStatistics.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]");

        System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
        System.err.println("\t\t\tE.g.: crawl/crawldb/");

        System.err.println("\toutDir\t\tOutput directory where results should be dumped");

        System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
        return 1;
    }
    String inputDir = args[0];
    String outputDir = args[1];

    int numOfReducers = 1;

    if (args.length > 2) {
        numOfReducers = Integer.parseInt(args[2]);
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ProtocolStatistics: starting at " + sdf.format(start));

    String jobName = "ProtocolStatistics";

    Configuration conf = getConf();
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(ProtocolStatusStatistics.class);

    String[] inputDirsSpecs = inputDir.split(",");
    for (int i = 0; i < inputDirsSpecs.length; i++) {
        File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
        FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(ProtocolStatusStatisticsMapper.class);
    job.setReducerClass(ProtocolStatusStatisticsReducer.class);
    job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = jobName + " job did not succeed, job status: " + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error(jobName + " job failed", e);
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}

From source file: org.apache.nutch.util.SitemapProcessor.java

License: Apache License

public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
        boolean normalize, int threads) throws Exception {
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
    }

    FileSystem fs = crawldb.getFileSystem(getConf());
    Path old = new Path(crawldb, "old");
    Path current = new Path(crawldb, "current");
    Path tempCrawlDb = new Path(crawldb,
            "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing crawldb to prevent multiple simultaneous updates
    Path lock = new Path(crawldb, LOCK_NAME);
    if (!fs.exists(current))
        fs.mkdirs(current);

    LockUtil.createLockFile(fs, lock, false);

    Configuration conf = getConf();
    conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
    conf.setBoolean(SITEMAP_URL_FILTERING, filter);
    conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
    job.setJarByClass(SitemapProcessor.class);

    // add crawlDb, sitemap url directory and hostDb to input paths
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

    if (sitemapUrlDir != null)
        MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);

    if (hostdb != null)
        MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempCrawlDb);

    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MultithreadedMapper.class);
    MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
    MultithreadedMapper.setNumberOfThreads(job, threads);
    job.setReducerClass(SitemapReducer.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
                    + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }

        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
        else
            FSUtils.replace(fs, old, current, true);

        FSUtils.replace(fs, current, tempCrawlDb, true);
        LockUtil.removeLockFile(fs, lock);

        if (LOG.isInfoEnabled()) {
            long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
            long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
            long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
            long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
            long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

            LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
            LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
            LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
            LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
            LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

            long end = System.currentTimeMillis();
            LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
                    TimingUtil.elapsedTime(start, end));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("SitemapProcessor_" + crawldb.toString(), e);
        NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
        throw e;
    }
}