List of usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath
public static void addInputPath(JobConf conf, Path path)
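Before the Nutch examples, here is a minimal, self-contained sketch of typical usage with the old org.apache.hadoop.mapred API. The job name and the input/output paths are hypothetical placeholders, and the mapper and reducer are left at their identity defaults.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath example");

        // Each call appends one more input directory (or file) to the job;
        // repeated calls accumulate, so several inputs can feed a single job.
        FileInputFormat.addInputPath(conf, new Path("/data/input-a"));
        FileInputFormat.addInputPath(conf, new Path("/data/input-b"));
        FileOutputFormat.setOutputPath(conf, new Path("/data/output"));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        // With the default identity mapper/reducer, TextInputFormat's
        // (LongWritable, Text) records pass straight through to the output.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        JobClient.runJob(conf);
    }
}

The Nutch examples that follow use the same pattern: build a JobConf, add one or more input paths, set the formats and key/value classes, and submit with JobClient.runJob.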
From source file:org.apache.nutch.indexer.field.CustomFields.java
License:Apache License
/**
 * MapReduce job that converts text values into FieldWritable objects.
 *
 * @param inputs The directories with text files to convert.
 * @param output The converter output directory.
 *
 * @throws IOException If an error occurs while converting.
 */
private void runConverter(Path[] inputs, Path output) throws IOException {

    JobConf converter = new NutchJob(getConf());
    converter.setJobName("CustomFields Converter");
    for (int i = 0; i < inputs.length; i++) {
        FileInputFormat.addInputPath(converter, inputs[i]);
    }
    FileOutputFormat.setOutputPath(converter, output);
    converter.setInputFormat(TextInputFormat.class);
    converter.setMapperClass(Converter.class);
    converter.setReducerClass(Converter.class);
    converter.setMapOutputKeyClass(Text.class);
    converter.setMapOutputValueClass(FieldWritable.class);
    converter.setOutputKeyClass(Text.class);
    converter.setOutputValueClass(FieldWritable.class);
    converter.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting converter job");
    try {
        JobClient.runJob(converter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished converter job.");
}
From source file:org.apache.nutch.indexer.field.CustomFields.java
License:Apache License
/**
 * Aggregates multiple FieldWritable objects with the same name. Depending on
 * settings in the custom-fields.xml file, a field may map to one or more fields.
 * This job aggregates fields and then collects them based on the configuration
 * settings.
 *
 * @param basicFields The basicfields FieldWritable objects.
 * @param converted The converted custom field objects.
 * @param output The final output directory for custom field objects.
 *
 * @throws IOException If an error occurs while collecting.
 */
private void runCollector(Path basicFields, Path converted, Path output) throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("CustomFields Collector");
    FileInputFormat.addInputPath(collector, converted);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}
From source file:org.apache.nutch.indexer.field.FieldIndexer.java
License:Apache License
public void index(Path[] fields, Path indexDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FieldIndexer: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setJobName("FieldIndexer: " + indexDir);

    for (int i = 0; i < fields.length; i++) {
        Path fieldsDb = fields[i];
        LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
        FileInputFormat.addInputPath(job, fieldsDb);
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(FieldIndexer.class);
    job.setReducerClass(FieldIndexer.class);
    FileOutputFormat.setOutputPath(job, indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FieldWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LuceneDocumentWrapper.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.indexer.IndexerMapReduce.java
License:Apache License
public static void initMRJob(Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {

    LOG.info("IndexerMapReduce: crawldb: " + crawlDb);

    if (linkDb != null)
        LOG.info("IndexerMapReduce: linkdb: " + linkDb);

    for (final Path segment : segments) {
        LOG.info("IndexerMapReduce: adding segment: " + segment);
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

    if (linkDb != null)
        FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}
From source file:org.apache.nutch.indexer.solr.SolrClean.java
License:Apache License
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrClean: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setBoolean("noCommit", noCommit);
    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(SolrDeleter.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.parse.ParseSegment.java
License:Apache License
public void parse(Path segment) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("ParseSegment: starting at " + sdf.format(start));
        LOG.info("ParseSegment: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(ParseSegment.class);
    job.setReducerClass(ParseSegment.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(ParseOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ParseImpl.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkDumper.java
License:Apache License
/**
 * Runs the inverter and merger jobs of the LinkDumper tool to create the
 * url to inlink node database.
 */
public void dumpLinks(Path webGraphDb) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path linkdump = new Path(webGraphDb, DUMP_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
    boolean loopsExists = fs.exists(loopSetDb);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);

    // run the inverter job
    Path tempInverted = new Path(webGraphDb, "inverted-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf inverter = new NutchJob(conf);
    inverter.setJobName("LinkDumper: inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    if (loopsExists) {
        FileInputFormat.addInputPath(inverter, loopSetDb);
    }
    FileInputFormat.addInputPath(inverter, outlinkDb);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkNode.class);
    FileOutputFormat.setOutputPath(inverter, tempInverted);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running inverter");
        JobClient.runJob(inverter);
        LOG.info("LinkDumper: finished inverter");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the merger job
    JobConf merger = new NutchJob(conf);
    merger.setJobName("LinkDumper: merger");
    FileInputFormat.addInputPath(merger, tempInverted);
    merger.setInputFormat(SequenceFileInputFormat.class);
    merger.setReducerClass(Merger.class);
    merger.setMapOutputKeyClass(Text.class);
    merger.setMapOutputValueClass(LinkNode.class);
    merger.setOutputKeyClass(Text.class);
    merger.setOutputValueClass(LinkNodes.class);
    FileOutputFormat.setOutputPath(merger, linkdump);
    merger.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("LinkDumper: running merger");
        JobClient.runJob(merger);
        LOG.info("LinkDumper: finished merger");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    fs.delete(tempInverted, true);
    long end = System.currentTimeMillis();
    LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

    // configure the initializer
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
        JobClient.runJob(initializer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished initialization job.");
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

    // configure the inverter
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);

    // add the loop database if it exists, isn't null
    if (loopDb != null) {
        FileInputFormat.addInputPath(inverter, loopDb);
    }
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
        JobClient.runJob(inverter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished inverter job.");
}