List of usage examples for org.apache.hadoop.mapreduce.Job.getCounters()
public Counters getCounters() throws IOException
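Before the project-specific examples below, here is a minimal sketch of the common pattern: run the job, then call getCounters() and look up a built-in framework counter or a user-defined one. The group and counter names "myapp" and "RECORDS_SKIPPED" are illustrative only and do not come from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "counter-example");
        // ... configure mapper, reducer, input and output paths here ...
        job.waitForCompletion(true);

        // Counters are only complete once the job has finished.
        Counters counters = job.getCounters();

        // Built-in framework counter.
        long mapOutput = counters.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getValue();

        // User-defined counter, looked up by group and name (illustrative names).
        Counter skipped = counters.findCounter("myapp", "RECORDS_SKIPPED");

        System.out.println("map output records: " + mapOutput
                + ", skipped: " + skipped.getValue());
    }
}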
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/** The MapReduce physical operator
 * @param map_fnc      the mapper function
 * @param combine_fnc  optional in-mapper combiner function
 * @param reduce_fnc   the reducer function
 * @param acc_fnc      optional accumulator function
 * @param zero         optional, the zero value for the accumulator
 * @param source       the input data source
 * @param num_reduces  number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp       does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc,        // mapper function
                                      Tree combine_fnc,    // optional in-mapper combiner function
                                      Tree reduce_fnc,     // reducer function
                                      Tree acc_fnc,        // optional accumulator function
                                      Tree zero,           // optional, the zero value for the accumulator
                                      DataSet source,      // input data source
                                      int num_reduces,     // number of reducers
                                      String stop_counter, // optional counter used in repeat operation
                                      boolean orderp)      // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.mrql.MapReducePlan.java
License:Apache License
/** find the number of records in the hadoop MapReduce job output */
public final static long outputRecords(Job job) throws Exception {
    CounterGroup cg = job.getCounters().getGroup("org.apache.hadoop.mapred.Task$Counter");
    long rc = cg.findCounter("REDUCE_OUTPUT_RECORDS").getValue();
    if (rc == 0)
        return cg.findCounter("MAP_OUTPUT_RECORDS").getValue();
    return rc;
}
From source file:org.apache.nutch.crawl.DeduplicationJob.java
License:Apache License
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println(
                "Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
        return 1;
    }
    String group = "none";
    Path crawlDb = new Path(args[0]);
    String compareOrder = "score,fetchTime,urlLength";
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-group"))
            group = args[++i];
        if (args[i].equals("-compareOrder")) {
            compareOrder = args[++i];
            if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1
                    || compareOrder.indexOf("urlLength") == -1) {
                System.err
                        .println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                return 1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeduplicationJob: starting at " + sdf.format(start));

    Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Deduplication on " + crawlDb);
    conf.set(DEDUPLICATION_GROUP_MODE, group);
    conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
    job.setJarByClass(DeduplicationJob.class);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);

    FileSystem fs = tempDir.getFileSystem(getConf());
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            throw new RuntimeException(message);
        }
        CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
        if (g != null) {
            Counter counter = g.findCounter("Documents marked as duplicate");
            long dups = counter.getValue();
            LOG.info("Deduplication: " + (int) dups + " documents marked as duplicates");
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        return -1;
    }

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
    }
    Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);
    mergeJob.setJarByClass(DeduplicationJob.class);

    fs = crawlDb.getFileSystem(getConf());
    Path outPath = FileOutputFormat.getOutputPath(job);
    Path lock = CrawlDb.lock(getConf(), crawlDb, false);
    try {
        boolean success = mergeJob.waitForCompletion(true);
        if (!success) {
            String message = "Crawl job did not succeed, job status:" + mergeJob.getStatus().getState()
                    + ", reason: " + mergeJob.getStatus().getFailureInfo();
            LOG.error(message);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        NutchJob.cleanupAfterFailure(outPath, lock, fs);
        return -1;
    }
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}
From source file:org.apache.nutch.mapreduce.NutchUtil.java
License:Apache License
public static Map<String, Object> getJobCounters(Job job, String... groups) {
    Map<String, Object> counters = Maps.newHashMap();
    if (job == null) {
        return counters;
    }
    try {
        for (CounterGroup group : job.getCounters()) {
            String groupName = group.getDisplayName();
            if (ArrayUtils.isEmpty(groups) || ArrayUtils.contains(groups, groupName)) {
                Map<String, Object> groupedCounters = Maps.newHashMap();
                for (Counter counter : group) {
                    groupedCounters.put(counter.getName(), counter.getValue());
                }
                counters.put(groupName, groupedCounters);
            }
        }
    } catch (Exception e) {
        counters.put("error", e.toString());
    }
    return counters;
}
From source file:org.apache.nutch.tools.warc.WARCExporter.java
License:Apache License
public int generateWARC(String output, List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("WARCExporter: starting at {}", sdf.format(start));

    final Job job = NutchJob.getInstance(getConf());
    job.setJobName("warc-exporter " + output);

    for (final Path segment : segments) {
        LOG.info("warc-exporter: adding segment: {}", segment);
        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setJarByClass(WARCMapReduce.class);
    job.setMapperClass(WARCMapReduce.WARCMapper.class);
    job.setReducerClass(WARCMapReduce.WARCReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(output));
    // using the old api
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "WARCExporter job did not succeed, job status:" + job.getStatus().getState()
                    + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            throw new RuntimeException(message);
        }
        LOG.info(job.getCounters().toString());
        long end = System.currentTimeMillis();
        LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
                TimingUtil.elapsedTime(start, end));
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("WARCExporter job failed: {}", e.getMessage());
        return -1;
    }
    return 0;
}
From source file:org.apache.nutch.util.SitemapProcessor.java
License:Apache License
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
        boolean normalize, int threads) throws Exception {
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("SitemapProcessor: Starting at {}", sdf.format(start));
    }

    FileSystem fs = crawldb.getFileSystem(getConf());
    Path old = new Path(crawldb, "old");
    Path current = new Path(crawldb, "current");
    Path tempCrawlDb = new Path(crawldb,
            "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // lock an existing crawldb to prevent multiple simultaneous updates
    Path lock = new Path(crawldb, LOCK_NAME);
    if (!fs.exists(current))
        fs.mkdirs(current);
    LockUtil.createLockFile(fs, lock, false);

    Configuration conf = getConf();
    conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
    conf.setBoolean(SITEMAP_URL_FILTERING, filter);
    conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
    job.setJarByClass(SitemapProcessor.class);

    // add crawlDb, sitemap url directory and hostDb to input paths
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
    if (sitemapUrlDir != null)
        MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);
    if (hostdb != null)
        MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, tempCrawlDb);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MultithreadedMapper.class);
    MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
    MultithreadedMapper.setNumberOfThreads(job, threads);
    job.setReducerClass(SitemapReducer.class);

    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = "SitemapProcessor_" + crawldb.toString() + " job did not succeed, job status: "
                    + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo();
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            // throw exception so that calling routine can exit with error
            throw new RuntimeException(message);
        }

        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (!preserveBackup && fs.exists(old))
            fs.delete(old, true);
        else
            FSUtils.replace(fs, old, current, true);

        FSUtils.replace(fs, current, tempCrawlDb, true);
        LockUtil.removeLockFile(fs, lock);

        if (LOG.isInfoEnabled()) {
            long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
            long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
            long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
            long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
            long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();

            LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
            LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
            LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
            LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
            LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);

            long end = System.currentTimeMillis();
            LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end),
                    TimingUtil.elapsedTime(start, end));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("SitemapProcessor_" + crawldb.toString(), e);
        NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
        throw e;
    }
}
From source file:org.apache.nutch.util.ToolUtil.java
License:Apache License
@SuppressWarnings("unchecked") public static final void recordJobStatus(String label, Job job, Map<String, Object> results) { Map<String, Object> jobs = (Map<String, Object>) results.get(Nutch.STAT_JOBS); if (jobs == null) { jobs = new LinkedHashMap<String, Object>(); results.put(Nutch.STAT_JOBS, jobs); }//from w w w. j a v a 2 s. c o m Map<String, Object> stats = new HashMap<String, Object>(); Map<String, Object> countStats = new HashMap<String, Object>(); try { Counters counters = job.getCounters(); for (CounterGroup cg : counters) { Map<String, Object> cnts = new HashMap<String, Object>(); countStats.put(cg.getDisplayName(), cnts); for (Counter c : cg) { cnts.put(c.getName(), c.getValue()); } } } catch (Exception e) { countStats.put("error", e.toString()); } stats.put(Nutch.STAT_COUNTERS, countStats); stats.put("jobName", job.getJobName()); stats.put("jobID", job.getJobID()); if (label == null) { label = job.getJobName(); if (job.getJobID() != null) { label = label + "-" + job.getJobID(); } } jobs.put(label, stats); }
From source file:org.apache.parquet.hadoop.example.TestInputOutputFormat.java
License:Apache License
private static long value(Job job, String groupName, String name) throws Exception {
    // getGroup moved to AbstractCounters
    Method getGroup = org.apache.hadoop.mapreduce.Counters.class.getMethod("getGroup", String.class);
    // CounterGroup changed to an interface
    Method findCounter = org.apache.hadoop.mapreduce.CounterGroup.class.getMethod("findCounter", String.class);
    // Counter changed to an interface
    Method getValue = org.apache.hadoop.mapreduce.Counter.class.getMethod("getValue");
    CounterGroup group = (CounterGroup) getGroup.invoke(job.getCounters(), groupName);
    Counter counter = (Counter) findCounter.invoke(group, name);
    return (Long) getValue.invoke(counter);
}
From source file:org.apache.phoenix.end2end.IndexScrutinyToolIT.java
License:Apache License
/**
 * Tests a data table that is correctly indexed. Scrutiny should report all rows as valid.
 */
@Test
public void testValidIndex() throws Exception {
    // insert two rows
    upsertRow(dataTableUpsertStmt, 1, "name-1", 94010);
    upsertRow(dataTableUpsertStmt, 2, "name-2", 95123);
    conn.commit();

    int numDataRows = countRows(dataTableFullName);
    int numIndexRows = countRows(indexTableFullName);

    // scrutiny should report everything as ok
    List<Job> completedJobs = runScrutiny(schemaName, dataTableName, indexTableName);
    Job job = completedJobs.get(0);
    assertTrue(job.isSuccessful());
    Counters counters = job.getCounters();
    assertEquals(2, getCounterValue(counters, VALID_ROW_COUNT));
    assertEquals(0, getCounterValue(counters, INVALID_ROW_COUNT));

    // make sure row counts weren't modified by scrutiny
    assertEquals(numDataRows, countRows(dataTableFullName));
    assertEquals(numIndexRows, countRows(indexTableFullName));
}
From source file:org.apache.phoenix.end2end.IndexScrutinyToolIT.java
License:Apache License
/**
 * Tests an index with the same # of rows as the data table, but one of the index rows is
 * incorrect. Scrutiny should report the invalid rows.
 */
@Test
public void testEqualRowCountIndexIncorrect() throws Exception {
    // insert one valid row
    upsertRow(dataTableUpsertStmt, 1, "name-1", 94010);
    conn.commit();

    // disable the index and insert another row which is not indexed
    disableIndex();
    upsertRow(dataTableUpsertStmt, 2, "name-2", 95123);
    conn.commit();

    // insert a bad row into the index
    upsertIndexRow("badName", 2, 9999);
    conn.commit();

    // scrutiny should report the bad row
    List<Job> completedJobs = runScrutiny(schemaName, dataTableName, indexTableName);
    Job job = completedJobs.get(0);
    assertTrue(job.isSuccessful());
    Counters counters = job.getCounters();
    assertEquals(1, getCounterValue(counters, VALID_ROW_COUNT));
    assertEquals(1, getCounterValue(counters, INVALID_ROW_COUNT));
}