Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

On this page you can find usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
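
Before the source-file examples, here is a minimal, self-contained sketch of the typical call pattern. It is not taken from the sources below; the driver class, job name, and input/output paths are placeholders. addInputPath may be called several times on the same JobConf, and each call appends another path to the job's input list.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathExample {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(AddInputPathExample.class);
        job.setJobName("addInputPath example");

        // Each call appends another directory or file to the job's inputs;
        // the paths below are placeholders.
        FileInputFormat.addInputPath(job, new Path("/data/input-a"));
        FileInputFormat.addInputPath(job, new Path("/data/input-b"));
        FileOutputFormat.setOutputPath(job, new Path("/data/output"));

        // With no mapper or reducer set, the identity classes copy the
        // input records (LongWritable offset, Text line) straight through.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        JobClient.runJob(job);
    }
}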

Usage

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 * 
 * @param nodeDb The node database from which we are getting previous link
 * rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 * 
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
        float rankOne) throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    LOG.info("Starting analysis job");
    try {
        JobClient.runJob(analyzer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished analysis job.");
}

From source file:org.apache.nutch.scoring.webgraph.Loops.java

License:Apache License

/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Loops: starting at " + sdf.format(start));
        LOG.info("Loops: webgraphdb: " + webGraphDb);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path routes = new Path(webGraphDb, ROUTES_DIR);
    Path tempRoute = new Path(webGraphDb,
            ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the initializer
    JobConf init = new NutchJob(conf);
    init.setJobName("Initializer: " + webGraphDb);
    FileInputFormat.addInputPath(init, outlinkDb);
    FileInputFormat.addInputPath(init, nodeDb);
    init.setInputFormat(SequenceFileInputFormat.class);
    init.setMapperClass(Initializer.class);
    init.setReducerClass(Initializer.class);
    init.setMapOutputKeyClass(Text.class);
    init.setMapOutputValueClass(ObjectWritable.class);
    init.setOutputKeyClass(Text.class);
    init.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(init, tempRoute);
    init.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("Loops: starting initializer");
        JobClient.runJob(init);
        LOG.info("Loops: installing initializer " + routes);
        FSUtils.replace(fs, routes, tempRoute, true);
        LOG.info("Loops: finished initializer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the loops job for a maxdepth, default 2, which will find a 3 link
    // loop cycle
    int depth = conf.getInt("link.loops.depth", 2);
    for (int i = 0; i < depth; i++) {

        JobConf looper = new NutchJob(conf);
        looper.setJobName("Looper: " + (i + 1) + " of " + depth);
        FileInputFormat.addInputPath(looper, outlinkDb);
        FileInputFormat.addInputPath(looper, routes);
        looper.setInputFormat(SequenceFileInputFormat.class);
        looper.setMapperClass(Looper.class);
        looper.setReducerClass(Looper.class);
        looper.setMapOutputKeyClass(Text.class);
        looper.setMapOutputValueClass(ObjectWritable.class);
        looper.setOutputKeyClass(Text.class);
        looper.setOutputValueClass(Route.class);
        FileOutputFormat.setOutputPath(looper, tempRoute);
        looper.setOutputFormat(SequenceFileOutputFormat.class);
        looper.setBoolean("last", i == (depth - 1));

        try {
            LOG.info("Loops: starting looper");
            JobClient.runJob(looper);
            LOG.info("Loops: installing looper " + routes);
            FSUtils.replace(fs, routes, tempRoute, true);
            LOG.info("Loops: finished looper");
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    // run the finalizer
    JobConf finalizer = new NutchJob(conf);
    finalizer.setJobName("Finalizer: " + webGraphDb);
    FileInputFormat.addInputPath(finalizer, routes);
    finalizer.setInputFormat(SequenceFileInputFormat.class);
    finalizer.setMapperClass(Finalizer.class);
    finalizer.setReducerClass(Finalizer.class);
    finalizer.setMapOutputKeyClass(Text.class);
    finalizer.setMapOutputValueClass(Route.class);
    finalizer.setOutputKeyClass(Text.class);
    finalizer.setOutputValueClass(LoopSet.class);
    FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
    finalizer.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("Loops: starting finalizer");
        JobClient.runJob(finalizer);
        LOG.info("Loops: finished finalizer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java

License:Apache License

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN The number of top values to dump.
 * @param output The output path for the dump.
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType,
        AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.ScoreUpdater.java

License:Apache License

/**
 * Updates the inlink score from the web graph node database into the crawl
 * database.
 * 
 * @param crawlDb The crawl database to update
 * @param webGraphDb The webgraph database to use.
 * 
 * @throws IOException If an error occurs while updating the scores.
 */
public void update(Path crawlDb, Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ScoreUpdater: starting at " + sdf.format(start));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // create a temporary crawldb with the new scores
    LOG.info("Running crawldb update " + crawlDb);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the updater job outputting to the temp crawl database
    JobConf updater = new NutchJob(conf);
    updater.setJobName("Update CrawlDb from WebGraph");
    FileInputFormat.addInputPath(updater, crawlDbCurrent);
    FileInputFormat.addInputPath(updater, nodeDb);
    FileOutputFormat.setOutputPath(updater, newCrawlDb);
    updater.setInputFormat(SequenceFileInputFormat.class);
    updater.setMapperClass(ScoreUpdater.class);
    updater.setReducerClass(ScoreUpdater.class);
    updater.setMapOutputKeyClass(Text.class);
    updater.setMapOutputValueClass(ObjectWritable.class);
    updater.setOutputKeyClass(Text.class);
    updater.setOutputValueClass(CrawlDatum.class);
    updater.setOutputFormat(MapFileOutputFormat.class);

    try {
        JobClient.runJob(updater);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));

        // remove the temp crawldb on error
        if (fs.exists(newCrawlDb)) {
            fs.delete(newCrawlDb, true);
        }
        throw e;
    }

    // install the temp crawl database
    LOG.info("ScoreUpdater: installing new crawldb " + crawlDb);
    CrawlDb.install(updater, crawlDb);

    long end = System.currentTimeMillis();
    LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.WebGraph.java

License:Apache License

/**
 * Creates the three different WebGraph databases, Outlinks, Inlinks, and
 * Node. If a current WebGraph exists then it is updated; otherwise a new
 * WebGraph database is created.
 *
 * @param webGraphDb The WebGraph to create or update.
 * @param segments The array of segments used to update the WebGraph. Newer
 * segments and fetch times will overwrite older segments.
 * @param normalize whether to use URLNormalizers on URLs in the segment
 * @param filter whether to use URLFilters on URLs in the segment
 * 
 * @throws IOException If an error occurs while processing the WebGraph.
 */
public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter)
        throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("WebGraphDb: starting at " + sdf.format(start));
        LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
        LOG.info("WebGraphDb: URL normalize: " + normalize);
        LOG.info("WebGraphDb: URL filter: " + filter);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // lock an existing webgraphdb to prevent multiple simultaneous updates
    Path lock = new Path(webGraphDb, LOCK_NAME);
    if (!fs.exists(webGraphDb)) {
        fs.mkdirs(webGraphDb);
    }

    LockUtil.createLockFile(fs, lock, false);

    // outlink and temp outlink database paths
    Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
    Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);

    if (!fs.exists(outlinkDb)) {
        fs.mkdirs(outlinkDb);
    }

    Path tempOutlinkDb = new Path(outlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf outlinkJob = new NutchJob(conf);
    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

    boolean deleteGone = conf.getBoolean("link.delete.gone", false);
    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);

    if (deleteGone) {
        LOG.info("OutlinkDb: deleting gone links");
    }

    // get the parse data and crawl fetch data for all segments
    if (segments != null) {
        for (int i = 0; i < segments.length; i++) {
            Path parseData = new Path(segments[i], ParseData.DIR_NAME);
            if (fs.exists(parseData)) {
                LOG.info("OutlinkDb: adding input: " + parseData);
                FileInputFormat.addInputPath(outlinkJob, parseData);
            }

            if (deleteGone) {
                Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
                if (fs.exists(crawlFetch)) {
                    LOG.info("OutlinkDb: adding input: " + crawlFetch);
                    FileInputFormat.addInputPath(outlinkJob, crawlFetch);
                }
            }
        }
    }

    // add the existing webgraph
    LOG.info("OutlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(outlinkJob, outlinkDb);

    outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
    outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);

    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
    outlinkJob.setMapperClass(OutlinkDb.class);
    outlinkJob.setReducerClass(OutlinkDb.class);
    outlinkJob.setMapOutputKeyClass(Text.class);
    outlinkJob.setMapOutputValueClass(NutchWritable.class);
    outlinkJob.setOutputKeyClass(Text.class);
    outlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
    outlinkJob.setOutputFormat(MapFileOutputFormat.class);
    outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the outlinkdb job and replace any old outlinkdb with the new one
    try {
        LOG.info("OutlinkDb: running");
        JobClient.runJob(outlinkJob);
        LOG.info("OutlinkDb: installing " + outlinkDb);
        FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
        FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
        if (!preserveBackup && fs.exists(oldOutlinkDb))
            fs.delete(oldOutlinkDb, true);
        LOG.info("OutlinkDb: finished");
    } catch (IOException e) {

        // remove lock file and temporary directory if an error occurs
        LockUtil.removeLockFile(fs, lock);
        if (fs.exists(tempOutlinkDb)) {
            fs.delete(tempOutlinkDb, true);
        }
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // inlink and temp link database paths
    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
    Path tempInlinkDb = new Path(inlinkDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf inlinkJob = new NutchJob(conf);
    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
    LOG.info("InlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(inlinkJob, outlinkDb);
    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
    inlinkJob.setMapperClass(InlinkDb.class);
    inlinkJob.setMapOutputKeyClass(Text.class);
    inlinkJob.setMapOutputValueClass(LinkDatum.class);
    inlinkJob.setOutputKeyClass(Text.class);
    inlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
    inlinkJob.setOutputFormat(MapFileOutputFormat.class);
    inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {

        // run the inlink and replace any old with new
        LOG.info("InlinkDb: running");
        JobClient.runJob(inlinkJob);
        LOG.info("InlinkDb: installing " + inlinkDb);
        FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
        LOG.info("InlinkDb: finished");
    } catch (IOException e) {

        // remove lock file and temporary directory if an error occurs
        LockUtil.removeLockFile(fs, lock);
        if (fs.exists(tempInlinkDb)) {
            fs.delete(tempInlinkDb, true);
        }
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // node and temp node database paths
    Path nodeDb = new Path(webGraphDb, NODE_DIR);
    Path tempNodeDb = new Path(nodeDb + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf nodeJob = new NutchJob(conf);
    nodeJob.setJobName("NodeDb " + nodeDb);
    LOG.info("NodeDb: adding input: " + outlinkDb);
    LOG.info("NodeDb: adding input: " + inlinkDb);
    FileInputFormat.addInputPath(nodeJob, outlinkDb);
    FileInputFormat.addInputPath(nodeJob, inlinkDb);
    nodeJob.setInputFormat(SequenceFileInputFormat.class);
    nodeJob.setReducerClass(NodeDb.class);
    nodeJob.setMapOutputKeyClass(Text.class);
    nodeJob.setMapOutputValueClass(LinkDatum.class);
    nodeJob.setOutputKeyClass(Text.class);
    nodeJob.setOutputValueClass(Node.class);
    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
    nodeJob.setOutputFormat(MapFileOutputFormat.class);
    nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {

        // run the node job and replace old nodedb with new
        LOG.info("NodeDb: running");
        JobClient.runJob(nodeJob);
        LOG.info("NodeDb: installing " + nodeDb);
        FSUtils.replace(fs, nodeDb, tempNodeDb, true);
        LOG.info("NodeDb: finished");
    } catch (IOException e) {

        // remove lock file and temporary directory if an error occurs
        LockUtil.removeLockFile(fs, lock);
        if (fs.exists(tempNodeDb)) {
            fs.delete(tempNodeDb, true);
        }
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // remove the lock file for the webgraph
    LockUtil.removeLockFile(fs, lock);

    long end = System.currentTimeMillis();
    LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.segment.SegmentMerger.java

License:Apache License

public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
    String segmentName = Generator.generateSegmentName();
    if (LOG.isInfoEnabled()) {
        LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
    }
    JobConf job = new NutchJob(getConf());
    job.setJobName("mergesegs " + out + "/" + segmentName);
    job.setBoolean("segment.merger.filter", filter);
    job.setBoolean("segment.merger.normalizer", normalize);
    job.setLong("segment.merger.slice", slice);
    job.set("segment.merger.segmentName", segmentName);
    FileSystem fs = FileSystem.get(getConf());
    // prepare the minimal common set of input dirs
    boolean g = true;
    boolean f = true;
    boolean p = true;
    boolean c = true;
    boolean pd = true;
    boolean pt = true;
    for (int i = 0; i < segs.length; i++) {
        if (!fs.exists(segs[i])) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
            }
            segs[i] = null;
            continue;
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentMerger:   adding " + segs[i]);
        }
        Path cDir = new Path(segs[i], Content.DIR_NAME);
        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
        c = c && fs.exists(cDir);
        g = g && fs.exists(gDir);
        f = f && fs.exists(fDir);
        p = p && fs.exists(pDir);
        pd = pd && fs.exists(pdDir);
        pt = pt && fs.exists(ptDir);
    }
    StringBuffer sb = new StringBuffer();
    if (c)
        sb.append(" " + Content.DIR_NAME);
    if (g)
        sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
    if (f)
        sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
    if (p)
        sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
    if (pd)
        sb.append(" " + ParseData.DIR_NAME);
    if (pt)
        sb.append(" " + ParseText.DIR_NAME);
    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentMerger: using segment data from:" + sb.toString());
    }
    for (int i = 0; i < segs.length; i++) {
        if (segs[i] == null)
            continue;
        if (g) {
            Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, gDir);
        }
        if (c) {
            Path cDir = new Path(segs[i], Content.DIR_NAME);
            FileInputFormat.addInputPath(job, cDir);
        }
        if (f) {
            Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
            FileInputFormat.addInputPath(job, fDir);
        }
        if (p) {
            Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
            FileInputFormat.addInputPath(job, pDir);
        }
        if (pd) {
            Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
            FileInputFormat.addInputPath(job, pdDir);
        }
        if (pt) {
            Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
            FileInputFormat.addInputPath(job, ptDir);
        }
    }
    job.setInputFormat(ObjectInputFormat.class);
    job.setMapperClass(SegmentMerger.class);
    job.setReducerClass(SegmentMerger.class);
    FileOutputFormat.setOutputPath(job, out);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MetaWrapper.class);
    job.setOutputFormat(SegmentOutputFormat.class);

    setConf(job);

    JobClient.runJob(job);
}

From source file:org.apache.nutch.segment.SegmentReader.java

License:Apache License

public void dump(Path segment, Path output) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentReader: dump segment: " + segment);
    }

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    if (ge)
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe)
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa)
        FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co)
        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    if (pd)
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    if (pt)
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);

    Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new java.util.Random().nextInt());
    fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    // concatenate the output
    Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

    // remove the old file
    fs.delete(dumpFile, true);
    FileStatus[] fstats = fs.listStatus(tempDir, HadoopFSUtil.getPassAllFilter());
    Path[] files = HadoopFSUtil.getPaths(fstats);

    PrintWriter writer = null;
    int currentRecordNumber = 0;
    if (files.length > 0) {
        writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(dumpFile))));
        try {
            for (int i = 0; i < files.length; i++) {
                Path partFile = (Path) files[i];
                try {
                    currentRecordNumber = append(fs, job, partFile, writer, currentRecordNumber);
                } catch (IOException exception) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Couldn't copy the content of " + partFile.toString() + " into "
                                + dumpFile.toString());
                        LOG.warn(exception.getMessage());
                    }
                }
            }
        } finally {
            writer.close();
        }
    }
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentReader: done");
    }
}

From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java

License:Apache License

public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {

    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);

    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.tools.arc.ArcSegmentCreator.java

License:Apache License

/**
 * <p>Creates the arc files to segments job.</p>
 *
 * @param arcFiles The path to the directory holding the arc files
 * @param segmentsOutDir The output directory for writing the segments
 * 
 * @throws IOException If an IO error occurs while running the job.
 */
public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
        LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("ArcSegmentCreator " + arcFiles);
    String segName = generateSegmentName();
    job.set(Nutch.SEGMENT_NAME_KEY, segName);
    FileInputFormat.addInputPath(job, arcFiles);
    job.setInputFormat(ArcInputFormat.class);
    job.setMapperClass(ArcSegmentCreator.class);
    FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.tools.compat.ReprUrlFixer.java

License:Apache License

/**
 * Run the fixer on any crawl database and segments specified.
 */
public void update(Path crawlDb, Path[] segments) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ReprUrlFixer: starting at " + sdf.format(start));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // run the crawl database through the repr fixer
    if (crawlDb != null) {

        LOG.info("ReprUrlFixer: crawlDb " + crawlDb);
        Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
        Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf updater = new NutchJob(conf);
        updater.setJobName("ReprUtilFixer: " + crawlDb.toString());
        FileInputFormat.addInputPath(updater, crawlDbCurrent);
        FileOutputFormat.setOutputPath(updater, newCrawlDb);
        updater.setInputFormat(SequenceFileInputFormat.class);
        updater.setReducerClass(ReprUrlFixer.class);
        updater.setOutputKeyClass(Text.class);
        updater.setOutputValueClass(CrawlDatum.class);
        updater.setOutputFormat(MapFileOutputFormat.class);

        try {
            JobClient.runJob(updater);
            LOG.info("ReprUrlFixer: installing new crawldb " + crawlDb);
            CrawlDb.install(updater, crawlDb);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    // run the segments through the repr fixer, logic will be run on both the
    // crawl_parse and the crawl_fetch directories for every segment specified
    if (segments != null) {

        for (int i = 0; i < segments.length; i++) {

            Path segment = segments[i];
            LOG.info("ReprUrlFixer: fetching segment " + segment);
            Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
            Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
                    + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

            JobConf fetch = new NutchJob(conf);
            fetch.setJobName("ReprUrlFixer: " + segment.toString());
            FileInputFormat.addInputPath(fetch, segFetch);
            FileOutputFormat.setOutputPath(fetch, newSegFetch);
            fetch.setInputFormat(SequenceFileInputFormat.class);
            fetch.setReducerClass(ReprUrlFixer.class);
            fetch.setOutputKeyClass(Text.class);
            fetch.setOutputValueClass(CrawlDatum.class);
            fetch.setOutputFormat(MapFileOutputFormat.class);

            try {
                JobClient.runJob(fetch);
                LOG.info("ReprUrlFixer: installing new segment fetch directory " + newSegFetch);
                FSUtils.replace(fs, segFetch, newSegFetch, true);
                LOG.info("ReprUrlFixer: finished installing segment fetch directory");
            } catch (IOException e) {
                LOG.error(StringUtils.stringifyException(e));
                throw e;
            }

            LOG.info("ReprUrlFixer: parsing segment " + segment);
            Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
            Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
                    + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

            JobConf parse = new NutchJob(conf);
            parse.setJobName("ReprUrlFixer: " + segment.toString());
            FileInputFormat.addInputPath(parse, segParse);
            FileOutputFormat.setOutputPath(parse, newSegParse);
            parse.setInputFormat(SequenceFileInputFormat.class);
            parse.setReducerClass(ReprUrlFixer.class);
            parse.setOutputKeyClass(Text.class);
            parse.setOutputValueClass(CrawlDatum.class);
            parse.setOutputFormat(MapFileOutputFormat.class);

            try {
                JobClient.runJob(parse);
                LOG.info("ReprUrlFixer: installing new segment parse directry " + newSegParse);
                FSUtils.replace(fs, segParse, newSegParse, true);
                LOG.info("ReprUrlFixer: finished installing segment parse directory");
            } catch (IOException e) {
                LOG.error(StringUtils.stringifyException(e));
                throw e;
            }
        }
    }

    long end = System.currentTimeMillis();
    LOG.info("ReprUrlFixer: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}