List of usage examples for org.apache.hadoop.mapreduce.Job.getCounters()
public Counters getCounters() throws IOException
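Before the project-specific examples below, here is a minimal sketch of the common pattern: run the job, then call getCounters() and look up a built-in framework counter or a user-defined one. The group and counter names "myapp" and "RECORDS_SKIPPED" are illustrative only and do not come from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "counter-example");
        // ... configure mapper, reducer, input and output paths here ...
        job.waitForCompletion(true);

        // Counters are only complete once the job has finished.
        Counters counters = job.getCounters();

        // Built-in framework counter.
        long mapOutput = counters.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getValue();

        // User-defined counter, looked up by group and name (illustrative names).
        Counter skipped = counters.findCounter("myapp", "RECORDS_SKIPPED");

        System.out.println("map output records: " + mapOutput
                + ", skipped: " + skipped.getValue());
    }
}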
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/** The MapReduce physical operator
 * @param map_fnc      the mapper function
 * @param combine_fnc  optional in-mapper combiner function
 * @param reduce_fnc   the reducer function
 * @param acc_fnc      optional accumulator function
 * @param zero         optional, the zero value for the accumulator
 * @param source       the input data source
 * @param num_reduces  number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp       does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc,        // mapper function
                                      Tree combine_fnc,    // optional in-mapper combiner function
                                      Tree reduce_fnc,     // reducer function
                                      Tree acc_fnc,        // optional accumulator function
                                      Tree zero,           // optional, the zero value for the accumulator
                                      DataSet source,      // input data source
                                      int num_reduces,     // number of reducers
                                      String stop_counter, // optional counter used in repeat operation
                                      boolean orderp)      // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.mrql.MapReducePlan.java
License:Apache License
/** find the number of records in the hadoop MapReduce job output */
public final static long outputRecords(Job job) throws Exception {
    CounterGroup cg = job.getCounters().getGroup("org.apache.hadoop.mapred.Task$Counter");
    long rc = cg.findCounter("REDUCE_OUTPUT_RECORDS").getValue();
    if (rc == 0)
        return cg.findCounter("MAP_OUTPUT_RECORDS").getValue();
    return rc;
}
From source file:org.apache.nutch.crawl.DeduplicationJob.java
License:Apache License
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println(
                "Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
        return 1;
    }
    String group = "none";
    Path crawlDb = new Path(args[0]);
    String compareOrder = "score,fetchTime,urlLength";
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-group"))
            group = args[++i];
        if (args[i].equals("-compareOrder")) {
            compareOrder = args[++i];
            if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1
                    || compareOrder.indexOf("urlLength") == -1) {
                System.err
                        .println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                return 1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeduplicationJob: starting at " + sdf.format(start));

    Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Deduplication on " + crawlDb);
    conf.set(DEDUPLICATION_GROUP_MODE, group);
    conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
    job.setJarByClass(DeduplicationJob.class);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    FileSystem fs = tempDir.getFileSystem(getConf());
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            throw new RuntimeException(message);
        }
        CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
        if (g != null) {
            Counter counter = g.findCounter("Documents marked as duplicate");
            long dups = counter.getValue();
            LOG.info("Deduplication: " + (int) dups + " documents marked as duplicates");
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
    }
    Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);
    mergeJob.setJarByClass(DeduplicationJob.class);

    fs = crawlDb.getFileSystem(getConf());
    Path outPath = FileOutputFormat.getOutputPath(job);
    Path lock = CrawlDb.lock(getConf(), crawlDb, false);
    try {
        boolean success = mergeJob.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + mergeJob.getStatus().getState()
                    + ", reason: " + mergeJob.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        NutchJob.cleanupAfterFailure(outPath, lock, fs);
        return -1;
    }
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}
From source file:org.apache.nutch.mapreduce.NutchUtil.java
License:Apache License
public static Map<String, Object> getJobCounters(Job job, String... groups) {
    Map<String, Object> counters = Maps.newHashMap();
    if (job == null) {
        return counters;
    }
    try {
        for (CounterGroup group : job.getCounters()) {
            String groupName = group.getDisplayName();
            if (ArrayUtils.isEmpty(groups) || ArrayUtils.contains(groups, groupName)) {
                Map<String, Object> groupedCounters = Maps.newHashMap();
                for (Counter counter : group) {
                    groupedCounters.put(counter.getName(), counter.getValue());
                }
                counters.put(groupName, groupedCounters);
            }
        }
    } catch (Exception e) {
        counters.put("error", e.toString());
    }
    return counters;
}
From source file:org.apache.nutch.tools.warc.WARCExporter.java
License:Apache License
public int generateWARC(String output, List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("WARCExporter: starting at {}", sdf.format(start));

    final Job job = NutchJob.getInstance(getConf());
    job.setJobName("warc-exporter " + output);

    for (final Path segment : segments) {
        LOG.info("warc-exporter: adding segment: {}", segment);
        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setJarByClass(WARCMapReduce.class);
    job.setMapperClass(WARCMapReduce.WARCMapper.class);
    job.setReducerClass(WARCMapReduce.WARCReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(output));
    // using the old api
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "WARCExporter job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            throw new RuntimeException(message);
        }
        LOG.info(job.getCounters().toString());
        long end = System.currentTimeMillis();
        LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
                TimingUtil.elapsedTime(start, end));
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("WARCExporter job failed: {}", e.getMessage());
        return -1;
    }
    return 0;
}
From source file:org.apache.nutch.util.SitemapProcessor.java
License:Apache License
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
        boolean normalize, int threads) throws Exception {
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
    }

    FileSystem fs = crawldb.getFileSystem(getConf());
    Path old = new Path(crawldb, "old");
    Path current = new Path(crawldb, "current");
    Path tempCrawlDb = new Path(crawldb,
            "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing crawldb to prevent multiple simultaneous updates
    Path lock = new Path(crawldb, LOCK_NAME);
    if (!fs.exists(current))
        fs.mkdirs(current);
    LockUtil.createLockFile(fs, lock, false);

    Configuration conf = getConf();
    conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
    conf.setBoolean(SITEMAP_URL_FILTERING, filter);
    conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
    job.setJarByClass(SitemapProcessor.class);

    // add crawlDb, sitemap url directory and hostDb to input paths
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
    if (sitemapUrlDir != null)
        MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);
    if (hostdb != null)
        MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempCrawlDb);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MultithreadedMapper.class);
    MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
    MultithreadedMapper.setNumberOfThreads(job, threads);
    job.setReducerClass(SitemapReducer.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
                    + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }

        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
        else
            FSUtils.replace(fs, old, current, true);

        FSUtils.replace(fs, current, tempCrawlDb, true);
        LockUtil.removeLockFile(fs, lock);

        if (LOG.isInfoEnabled()) {
            long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
            long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
            long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
            long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
            long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

            LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
            LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
            LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
            LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
            LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

            long end = System.currentTimeMillis();
            LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
                    TimingUtil.elapsedTime(start, end));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("SitemapProcessor_" + crawldb.toString(), e);
        NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
        throw e;
    }
}
From source file:org.apache.nutch.util.ToolUtil.java
License:Apache License
@SuppressWarnings("unchecked") public static final void recordJobStatus(String label, Job job, Map<String, Object> results) { Map<String, Object> jobs = (Map<String, Object>) results.get(Nutch.STAT_JOBS); if (jobs == null) { jobs = new LinkedHashMap<String, Object>(); results.put(Nutch.STAT_JOBS, jobs); }//from w w w. j a v a 2 s. c o m Map<String, Object> stats = new HashMap<String, Object>(); Map<String, Object> countStats = new HashMap<String, Object>(); try { Counters counters = job.getCounters(); for (CounterGroup cg : counters) { Map<String, Object> cnts = new HashMap<String, Object>(); countStats.put(cg.getDisplayName(), cnts); for (Counter c : cg) { cnts.put(c.getName(), c.getValue()); } } } catch (Exception e) { countStats.put("error", e.toString()); } stats.put(Nutch.STAT_COUNTERS, countStats); stats.put("jobName", job.getJobName()); stats.put("jobID", job.getJobID()); if (label == null) { label = job.getJobName(); if (job.getJobID() != null) { label = label + "-" + job.getJobID(); } } jobs.put(label, stats); }
From source file:org.apache.parquet.hadoop.example.TestInputOutputFormat.java
License:Apache License
private static long value(Job job, String groupName, String name) throws Exception {
    // getGroup moved to AbstractCounters
    Method getGroup = org.apache.hadoop.mapreduce.Counters.class.getMethod("getGroup", String.class);
    // CounterGroup changed to an interface
    Method findCounter = org.apache.hadoop.mapreduce.CounterGroup.class.getMethod("findCounter", String.class);
    // Counter changed to an interface
    Method getValue = org.apache.hadoop.mapreduce.Counter.class.getMethod("getValue");
    CounterGroup group = (CounterGroup) getGroup.invoke(job.getCounters(), groupName);
    Counter counter = (Counter) findCounter.invoke(group, name);
    return (Long) getValue.invoke(counter);
}
From source file:org.apache.phoenix.end2end.IndexScrutinyToolIT.java
License:Apache License
/**
 * Tests a data table that is correctly indexed. Scrutiny should report all rows as valid.
 */
@Test
public void testValidIndex() throws Exception {
    // insert two rows
    upsertRow(dataTableUpsertStmt, 1, "name-1", 94010);
    upsertRow(dataTableUpsertStmt, 2, "name-2", 95123);
    conn.commit();

    int numDataRows = countRows(dataTableFullName);
    int numIndexRows = countRows(indexTableFullName);

    // scrutiny should report everything as ok
    List<Job> completedJobs = runScrutiny(schemaName, dataTableName, indexTableName);
    Job job = completedJobs.get(0);
    assertTrue(job.isSuccessful());
    Counters counters = job.getCounters();
    assertEquals(2, getCounterValue(counters, VALID_ROW_COUNT));
    assertEquals(0, getCounterValue(counters, INVALID_ROW_COUNT));

    // make sure row counts weren't modified by scrutiny
    assertEquals(numDataRows, countRows(dataTableFullName));
    assertEquals(numIndexRows, countRows(indexTableFullName));
}
From source file:org.apache.phoenix.end2end.IndexScrutinyToolIT.java
License:Apache License
/**
 * Tests an index with the same # of rows as the data table, but one of the index rows is
 * incorrect. Scrutiny should report the invalid rows.
 */
@Test
public void testEqualRowCountIndexIncorrect() throws Exception {
    // insert one valid row
    upsertRow(dataTableUpsertStmt, 1, "name-1", 94010);
    conn.commit();

    // disable the index and insert another row which is not indexed
    disableIndex();
    upsertRow(dataTableUpsertStmt, 2, "name-2", 95123);
    conn.commit();

    // insert a bad row into the index
    upsertIndexRow("badName", 2, 9999);
    conn.commit();

    // scrutiny should report the bad row
    List<Job> completedJobs = runScrutiny(schemaName, dataTableName, indexTableName);
    Job job = completedJobs.get(0);
    assertTrue(job.isSuccessful());
    Counters counters = job.getCounters();
    assertEquals(1, getCounterValue(counters, VALID_ROW_COUNT));
    assertEquals(1, getCounterValue(counters, INVALID_ROW_COUNT));
}