List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
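Before the full examples that follow, here is a minimal sketch of the typical call pattern (a hypothetical driver class, not taken from any of the source files listed below; the input and output paths are placeholders): create a JobConf, call FileInputFormat.addInputPath once per input directory or file, set the output path, and submit the job with JobClient.runJob.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;

    public class AddInputPathExample {

      public static void main(String[] args) throws IOException {

        JobConf job = new JobConf(AddInputPathExample.class);
        job.setJobName("addInputPath example");

        // addInputPath may be called repeatedly; each call appends another
        // directory (or file) to the job's list of input paths.
        // The paths used here are placeholders.
        FileInputFormat.addInputPath(job, new Path("/data/input-a"));
        FileInputFormat.addInputPath(job, new Path("/data/input-b"));

        job.setInputFormat(TextInputFormat.class);

        // No mapper or reducer is set, so the identity implementations are
        // used; TextInputFormat supplies LongWritable offsets and Text lines,
        // and the declared output key/value classes match that.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(job, new Path("/data/output"));
        job.setOutputFormat(TextOutputFormat.class);

        JobClient.runJob(job);
      }
    }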
From source file: org.apache.nutch.scoring.webgraph.LinkRank.java
License: Apache License
/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 *
 * @param nodeDb The node database from which we are getting previous link
 *          rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 *
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
    float rankOne) throws IOException {

  JobConf analyzer = new NutchJob(getConf());
  analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
  analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
  FileInputFormat.addInputPath(analyzer, nodeDb);
  FileInputFormat.addInputPath(analyzer, inverted);
  FileOutputFormat.setOutputPath(analyzer, output);
  analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
  analyzer.setMapOutputKeyClass(Text.class);
  analyzer.setMapOutputValueClass(ObjectWritable.class);
  analyzer.setInputFormat(SequenceFileInputFormat.class);
  analyzer.setMapperClass(Analyzer.class);
  analyzer.setReducerClass(Analyzer.class);
  analyzer.setOutputKeyClass(Text.class);
  analyzer.setOutputValueClass(Node.class);
  analyzer.setOutputFormat(MapFileOutputFormat.class);
  analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  LOG.info("Starting analysis job");
  try {
    JobClient.runJob(analyzer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished analysis job.");
}
From source file: org.apache.nutch.scoring.webgraph.Loops.java
License: Apache License
/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Loops: starting at " + sdf.format(start));
    LOG.info("Loops: webgraphdb: " + webGraphDb);
  }

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path routes = new Path(webGraphDb, ROUTES_DIR);
  Path tempRoute = new Path(webGraphDb,
      ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // run the initializer
  JobConf init = new NutchJob(conf);
  init.setJobName("Initializer: " + webGraphDb);
  FileInputFormat.addInputPath(init, outlinkDb);
  FileInputFormat.addInputPath(init, nodeDb);
  init.setInputFormat(SequenceFileInputFormat.class);
  init.setMapperClass(Initializer.class);
  init.setReducerClass(Initializer.class);
  init.setMapOutputKeyClass(Text.class);
  init.setMapOutputValueClass(ObjectWritable.class);
  init.setOutputKeyClass(Text.class);
  init.setOutputValueClass(Route.class);
  FileOutputFormat.setOutputPath(init, tempRoute);
  init.setOutputFormat(SequenceFileOutputFormat.class);

  try {
    LOG.info("Loops: starting initializer");
    JobClient.runJob(init);
    LOG.info("Loops: installing initializer " + routes);
    FSUtils.replace(fs, routes, tempRoute, true);
    LOG.info("Loops: finished initializer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // run the loops job for a maxdepth, default 2, which will find a 3 link
  // loop cycle
  int depth = conf.getInt("link.loops.depth", 2);
  for (int i = 0; i < depth; i++) {

    JobConf looper = new NutchJob(conf);
    looper.setJobName("Looper: " + (i + 1) + " of " + depth);
    FileInputFormat.addInputPath(looper, outlinkDb);
    FileInputFormat.addInputPath(looper, routes);
    looper.setInputFormat(SequenceFileInputFormat.class);
    looper.setMapperClass(Looper.class);
    looper.setReducerClass(Looper.class);
    looper.setMapOutputKeyClass(Text.class);
    looper.setMapOutputValueClass(ObjectWritable.class);
    looper.setOutputKeyClass(Text.class);
    looper.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(looper, tempRoute);
    looper.setOutputFormat(SequenceFileOutputFormat.class);
    looper.setBoolean("last", i == (depth - 1));

    try {
      LOG.info("Loops: starting looper");
      JobClient.runJob(looper);
      LOG.info("Loops: installing looper " + routes);
      FSUtils.replace(fs, routes, tempRoute, true);
      LOG.info("Loops: finished looper");
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
  }

  // run the finalizer
  JobConf finalizer = new NutchJob(conf);
  finalizer.setJobName("Finalizer: " + webGraphDb);
  FileInputFormat.addInputPath(finalizer, routes);
  finalizer.setInputFormat(SequenceFileInputFormat.class);
  finalizer.setMapperClass(Finalizer.class);
  finalizer.setReducerClass(Finalizer.class);
  finalizer.setMapOutputKeyClass(Text.class);
  finalizer.setMapOutputValueClass(Route.class);
  finalizer.setOutputKeyClass(Text.class);
  finalizer.setOutputValueClass(LoopSet.class);
  FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
  finalizer.setOutputFormat(MapFileOutputFormat.class);

  try {
    LOG.info("Loops: starting finalizer");
    JobClient.runJob(finalizer);
    LOG.info("Loops: finished finalizer");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.scoring.webgraph.NodeDumper.java
License: Apache License
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
    NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));

  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.scoring.webgraph.ScoreUpdater.java
License: Apache License
/**
 * Updates the inlink score in the web graph node database into the crawl
 * database.
 *
 * @param crawlDb The crawl database to update
 * @param webGraphDb The webgraph database to use.
 *
 * @throws IOException If an error occurs while updating the scores.
 */
public void update(Path crawlDb, Path webGraphDb) throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("ScoreUpdater: starting at " + sdf.format(start));

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // create a temporary crawldb with the new scores
  LOG.info("Running crawldb update " + crawlDb);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
  Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // run the updater job outputting to the temp crawl database
  JobConf updater = new NutchJob(conf);
  updater.setJobName("Update CrawlDb from WebGraph");
  FileInputFormat.addInputPath(updater, crawlDbCurrent);
  FileInputFormat.addInputPath(updater, nodeDb);
  FileOutputFormat.setOutputPath(updater, newCrawlDb);
  updater.setInputFormat(SequenceFileInputFormat.class);
  updater.setMapperClass(ScoreUpdater.class);
  updater.setReducerClass(ScoreUpdater.class);
  updater.setMapOutputKeyClass(Text.class);
  updater.setMapOutputValueClass(ObjectWritable.class);
  updater.setOutputKeyClass(Text.class);
  updater.setOutputValueClass(CrawlDatum.class);
  updater.setOutputFormat(MapFileOutputFormat.class);

  try {
    JobClient.runJob(updater);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));

    // remove the temp crawldb on error
    if (fs.exists(newCrawlDb)) {
      fs.delete(newCrawlDb, true);
    }
    throw e;
  }

  // install the temp crawl database
  LOG.info("ScoreUpdater: installing new crawldb " + crawlDb);
  CrawlDb.install(updater, crawlDb);

  long end = System.currentTimeMillis();
  LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.scoring.webgraph.WebGraph.java
License: Apache License
/**
 * Creates the three different WebGraph databases, Outlinks, Inlinks, and
 * Node. If a current WebGraph exists then it is updated, if it doesn't exist
 * then a new WebGraph database is created.
 *
 * @param webGraphDb The WebGraph to create or update.
 * @param segments The array of segments used to update the WebGraph. Newer
 *          segments and fetch times will overwrite older segments.
 * @param normalize whether to use URLNormalizers on URL's in the segment
 * @param filter whether to use URLFilters on URL's in the segment
 *
 * @throws IOException If an error occurs while processing the WebGraph.
 */
public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter)
    throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("WebGraphDb: starting at " + sdf.format(start));
    LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
    LOG.info("WebGraphDb: URL normalize: " + normalize);
    LOG.info("WebGraphDb: URL filter: " + filter);
  }

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // lock an existing webgraphdb to prevent multiple simultaneous updates
  Path lock = new Path(webGraphDb, LOCK_NAME);
  if (!fs.exists(webGraphDb)) {
    fs.mkdirs(webGraphDb);
  }
  LockUtil.createLockFile(fs, lock, false);

  // outlink and temp outlink database paths
  Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
  Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);
  if (!fs.exists(outlinkDb)) {
    fs.mkdirs(outlinkDb);
  }
  Path tempOutlinkDb = new Path(outlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf outlinkJob = new NutchJob(conf);
  outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

  boolean deleteGone = conf.getBoolean("link.delete.gone", false);
  boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);

  if (deleteGone) {
    LOG.info("OutlinkDb: deleting gone links");
  }

  // get the parse data and crawl fetch data for all segments
  if (segments != null) {
    for (int i = 0; i < segments.length; i++) {
      Path parseData = new Path(segments[i], ParseData.DIR_NAME);
      if (fs.exists(parseData)) {
        LOG.info("OutlinkDb: adding input: " + parseData);
        FileInputFormat.addInputPath(outlinkJob, parseData);
      }

      if (deleteGone) {
        Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        if (fs.exists(crawlFetch)) {
          LOG.info("OutlinkDb: adding input: " + crawlFetch);
          FileInputFormat.addInputPath(outlinkJob, crawlFetch);
        }
      }
    }
  }

  // add the existing webgraph
  LOG.info("OutlinkDb: adding input: " + outlinkDb);
  FileInputFormat.addInputPath(outlinkJob, outlinkDb);

  outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
  outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);

  outlinkJob.setInputFormat(SequenceFileInputFormat.class);
  outlinkJob.setMapperClass(OutlinkDb.class);
  outlinkJob.setReducerClass(OutlinkDb.class);
  outlinkJob.setMapOutputKeyClass(Text.class);
  outlinkJob.setMapOutputValueClass(NutchWritable.class);
  outlinkJob.setOutputKeyClass(Text.class);
  outlinkJob.setOutputValueClass(LinkDatum.class);
  FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
  outlinkJob.setOutputFormat(MapFileOutputFormat.class);
  outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the outlinkdb job and replace any old outlinkdb with the new one
  try {
    LOG.info("OutlinkDb: running");
    JobClient.runJob(outlinkJob);
    LOG.info("OutlinkDb: installing " + outlinkDb);
    FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
    FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
    if (!preserveBackup && fs.exists(oldOutlinkDb))
      fs.delete(oldOutlinkDb, true);
    LOG.info("OutlinkDb: finished");
  } catch (IOException e) {

    // remove lock file and temporary directory if an error occurs
    LockUtil.removeLockFile(fs, lock);
    if (fs.exists(tempOutlinkDb)) {
      fs.delete(tempOutlinkDb, true);
    }
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // inlink and temp link database paths
  Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
  Path tempInlinkDb = new Path(inlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf inlinkJob = new NutchJob(conf);
  inlinkJob.setJobName("Inlinkdb " + inlinkDb);
  LOG.info("InlinkDb: adding input: " + outlinkDb);
  FileInputFormat.addInputPath(inlinkJob, outlinkDb);
  inlinkJob.setInputFormat(SequenceFileInputFormat.class);
  inlinkJob.setMapperClass(InlinkDb.class);
  inlinkJob.setMapOutputKeyClass(Text.class);
  inlinkJob.setMapOutputValueClass(LinkDatum.class);
  inlinkJob.setOutputKeyClass(Text.class);
  inlinkJob.setOutputValueClass(LinkDatum.class);
  FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
  inlinkJob.setOutputFormat(MapFileOutputFormat.class);
  inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  try {

    // run the inlink and replace any old with new
    LOG.info("InlinkDb: running");
    JobClient.runJob(inlinkJob);
    LOG.info("InlinkDb: installing " + inlinkDb);
    FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
    LOG.info("InlinkDb: finished");
  } catch (IOException e) {

    // remove lock file and temporary directory if an error occurs
    LockUtil.removeLockFile(fs, lock);
    if (fs.exists(tempInlinkDb)) {
      fs.delete(tempInlinkDb, true);
    }
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // node and temp node database paths
  Path nodeDb = new Path(webGraphDb, NODE_DIR);
  Path tempNodeDb = new Path(nodeDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf nodeJob = new NutchJob(conf);
  nodeJob.setJobName("NodeDb " + nodeDb);
  LOG.info("NodeDb: adding input: " + outlinkDb);
  LOG.info("NodeDb: adding input: " + inlinkDb);
  FileInputFormat.addInputPath(nodeJob, outlinkDb);
  FileInputFormat.addInputPath(nodeJob, inlinkDb);
  nodeJob.setInputFormat(SequenceFileInputFormat.class);
  nodeJob.setReducerClass(NodeDb.class);
  nodeJob.setMapOutputKeyClass(Text.class);
  nodeJob.setMapOutputValueClass(LinkDatum.class);
  nodeJob.setOutputKeyClass(Text.class);
  nodeJob.setOutputValueClass(Node.class);
  FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
  nodeJob.setOutputFormat(MapFileOutputFormat.class);
  nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  try {

    // run the node job and replace old nodedb with new
    LOG.info("NodeDb: running");
    JobClient.runJob(nodeJob);
    LOG.info("NodeDb: installing " + nodeDb);
    FSUtils.replace(fs, nodeDb, tempNodeDb, true);
    LOG.info("NodeDb: finished");
  } catch (IOException e) {

    // remove lock file and temporary directory if an error occurs
    LockUtil.removeLockFile(fs, lock);
    if (fs.exists(tempNodeDb)) {
      fs.delete(tempNodeDb, true);
    }
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }

  // remove the lock file for the webgraph
  LockUtil.removeLockFile(fs, lock);

  long end = System.currentTimeMillis();
  LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.segment.SegmentMerger.java
License: Apache License
public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
  String segmentName = Generator.generateSegmentName();
  if (LOG.isInfoEnabled()) {
    LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
  }
  JobConf job = new NutchJob(getConf());
  job.setJobName("mergesegs " + out + "/" + segmentName);
  job.setBoolean("segment.merger.filter", filter);
  job.setBoolean("segment.merger.normalizer", normalize);
  job.setLong("segment.merger.slice", slice);
  job.set("segment.merger.segmentName", segmentName);
  FileSystem fs = FileSystem.get(getConf());

  // prepare the minimal common set of input dirs
  boolean g = true;
  boolean f = true;
  boolean p = true;
  boolean c = true;
  boolean pd = true;
  boolean pt = true;
  for (int i = 0; i < segs.length; i++) {
    if (!fs.exists(segs[i])) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
      }
      segs[i] = null;
      continue;
    }
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentMerger: adding " + segs[i]);
    }
    Path cDir = new Path(segs[i], Content.DIR_NAME);
    Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
    Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
    Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
    Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
    Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
    c = c && fs.exists(cDir);
    g = g && fs.exists(gDir);
    f = f && fs.exists(fDir);
    p = p && fs.exists(pDir);
    pd = pd && fs.exists(pdDir);
    pt = pt && fs.exists(ptDir);
  }
  StringBuffer sb = new StringBuffer();
  if (c)
    sb.append(" " + Content.DIR_NAME);
  if (g)
    sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
  if (f)
    sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
  if (p)
    sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
  if (pd)
    sb.append(" " + ParseData.DIR_NAME);
  if (pt)
    sb.append(" " + ParseText.DIR_NAME);
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentMerger: using segment data from:" + sb.toString());
  }
  for (int i = 0; i < segs.length; i++) {
    if (segs[i] == null)
      continue;
    if (g) {
      Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
      FileInputFormat.addInputPath(job, gDir);
    }
    if (c) {
      Path cDir = new Path(segs[i], Content.DIR_NAME);
      FileInputFormat.addInputPath(job, cDir);
    }
    if (f) {
      Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
      FileInputFormat.addInputPath(job, fDir);
    }
    if (p) {
      Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
      FileInputFormat.addInputPath(job, pDir);
    }
    if (pd) {
      Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
      FileInputFormat.addInputPath(job, pdDir);
    }
    if (pt) {
      Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
      FileInputFormat.addInputPath(job, ptDir);
    }
  }
  job.setInputFormat(ObjectInputFormat.class);
  job.setMapperClass(SegmentMerger.class);
  job.setReducerClass(SegmentMerger.class);
  FileOutputFormat.setOutputPath(job, out);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(MetaWrapper.class);
  job.setOutputFormat(SegmentOutputFormat.class);

  setConf(job);

  JobClient.runJob(job);
}
From source file: org.apache.nutch.segment.SegmentReader.java
License: Apache License
public void dump(Path segment, Path output) throws IOException {

  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: dump segment: " + segment);
  }

  JobConf job = createJobConf();
  job.setJobName("read " + segment);

  if (ge)
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  if (fe)
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
  if (pa)
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
  if (co)
    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
  if (pd)
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
  if (pt)
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(InputCompatMapper.class);
  job.setReducerClass(SegmentReader.class);

  Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new java.util.Random().nextInt());
  fs.delete(tempDir, true);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  // concatenate the output
  Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

  // remove the old file
  fs.delete(dumpFile, true);
  FileStatus[] fstats = fs.listStatus(tempDir, HadoopFSUtil.getPassAllFilter());
  Path[] files = HadoopFSUtil.getPaths(fstats);

  PrintWriter writer = null;
  int currentRecordNumber = 0;
  if (files.length > 0) {
    writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(dumpFile))));
    try {
      for (int i = 0; i < files.length; i++) {
        Path partFile = (Path) files[i];
        try {
          currentRecordNumber = append(fs, job, partFile, writer, currentRecordNumber);
        } catch (IOException exception) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't copy the content of " + partFile.toString() + " into " + dumpFile.toString());
            LOG.warn(exception.getMessage());
          }
        }
      }
    } finally {
      writer.close();
    }
  }
  fs.delete(tempDir, true);
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentReader: done");
  }
}
From source file: org.apache.nutch.selenium.fetcher.SeleniumFetcher.java
License: Apache License
public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {

  checkConfiguration();

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Fetcher: starting at " + sdf.format(start));
    LOG.info("Fetcher: segment: " + segment);
  }

  // set the actual time for the timelimit relative
  // to the beginning of the whole job and not of a specific task
  // otherwise it keeps trying again if a task fails
  long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
  if (timelimit != -1) {
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    LOG.info("Fetcher Timelimit set for : " + timelimit);
    getConf().setLong("fetcher.timelimit", timelimit);
  }

  // Set the time limit after which the throughput threshold feature is enabled
  timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
  timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
  getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

  int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
  if (maxOutlinkDepth > 0) {
    LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

    int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
    int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

    int totalOutlinksToFollow = 0;
    for (int i = 0; i < maxOutlinkDepth; i++) {
      totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
    }

    LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
  }

  JobConf job = new NutchJob(getConf());
  job.setJobName("fetch " + segment);

  job.setInt("fetcher.threads.fetch", threads);
  job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

  // for politeness, don't permit parallel execution of a single task
  job.setSpeculativeExecution(false);

  // push the zipped_webdriver binaries onto the DistributedCache
  DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);
  job.set("webdriver.binaries.path", zippedDriverPath);

  FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  job.setInputFormat(InputFormat.class);

  job.setMapRunnerClass(SeleniumFetcher.class);

  FileOutputFormat.setOutputPath(job, segment);
  job.setOutputFormat(FetcherOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.tools.arc.ArcSegmentCreator.java
License: Apache License
/**
 * <p>Creates the arc files to segments job.</p>
 *
 * @param arcFiles The path to the directory holding the arc files
 * @param segmentsOutDir The output directory for writing the segments
 *
 * @throws IOException If an IO error occurs while running the job.
 */
public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
    LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
  }

  JobConf job = new NutchJob(getConf());
  job.setJobName("ArcSegmentCreator " + arcFiles);
  String segName = generateSegmentName();
  job.set(Nutch.SEGMENT_NAME_KEY, segName);
  FileInputFormat.addInputPath(job, arcFiles);
  job.setInputFormat(ArcInputFormat.class);
  job.setMapperClass(ArcSegmentCreator.class);
  FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
  job.setOutputFormat(FetcherOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file: org.apache.nutch.tools.compat.ReprUrlFixer.java
License: Apache License
/**
 * Run the fixer on any crawl database and segments specified.
 */
public void update(Path crawlDb, Path[] segments) throws IOException {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("ReprUrlFixer: starting at " + sdf.format(start));

  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  // run the crawl database through the repr fixer
  if (crawlDb != null) {

    LOG.info("ReprUrlFixer: crawlDb " + crawlDb);
    Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf updater = new NutchJob(conf);
    updater.setJobName("ReprUtilFixer: " + crawlDb.toString());
    FileInputFormat.addInputPath(updater, crawlDbCurrent);
    FileOutputFormat.setOutputPath(updater, newCrawlDb);
    updater.setInputFormat(SequenceFileInputFormat.class);
    updater.setReducerClass(ReprUrlFixer.class);
    updater.setOutputKeyClass(Text.class);
    updater.setOutputValueClass(CrawlDatum.class);
    updater.setOutputFormat(MapFileOutputFormat.class);

    try {
      JobClient.runJob(updater);
      LOG.info("ReprUrlFixer: installing new crawldb " + crawlDb);
      CrawlDb.install(updater, crawlDb);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
  }

  // run the segments through the repr fixer, logic will be run on both the
  // crawl_parse and the crawl_fetch directories for every segment specified
  if (segments != null) {

    for (int i = 0; i < segments.length; i++) {

      Path segment = segments[i];
      LOG.info("ReprUrlFixer: fetching segment " + segment);
      Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
      Path newSegFetch = new Path(segment,
          CrawlDatum.FETCH_DIR_NAME + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

      JobConf fetch = new NutchJob(conf);
      fetch.setJobName("ReprUrlFixer: " + segment.toString());
      FileInputFormat.addInputPath(fetch, segFetch);
      FileOutputFormat.setOutputPath(fetch, newSegFetch);
      fetch.setInputFormat(SequenceFileInputFormat.class);
      fetch.setReducerClass(ReprUrlFixer.class);
      fetch.setOutputKeyClass(Text.class);
      fetch.setOutputValueClass(CrawlDatum.class);
      fetch.setOutputFormat(MapFileOutputFormat.class);

      try {
        JobClient.runJob(fetch);
        LOG.info("ReprUrlFixer: installing new segment fetch directory " + newSegFetch);
        FSUtils.replace(fs, segFetch, newSegFetch, true);
        LOG.info("ReprUrlFixer: finished installing segment fetch directory");
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
      }

      LOG.info("ReprUrlFixer: parsing segment " + segment);
      Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
      Path newSegParse = new Path(segment,
          CrawlDatum.PARSE_DIR_NAME + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

      JobConf parse = new NutchJob(conf);
      parse.setJobName("ReprUrlFixer: " + segment.toString());
      FileInputFormat.addInputPath(parse, segParse);
      FileOutputFormat.setOutputPath(parse, newSegParse);
      parse.setInputFormat(SequenceFileInputFormat.class);
      parse.setReducerClass(ReprUrlFixer.class);
      parse.setOutputKeyClass(Text.class);
      parse.setOutputValueClass(CrawlDatum.class);
      parse.setOutputFormat(MapFileOutputFormat.class);

      try {
        JobClient.runJob(parse);
        LOG.info("ReprUrlFixer: installing new segment parse directory " + newSegParse);
        FSUtils.replace(fs, segParse, newSegParse, true);
        LOG.info("ReprUrlFixer: finished installing segment parse directory");
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
      }
    }
  }

  long end = System.currentTimeMillis();
  LOG.info("ReprUrlFixer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}