Example usage for org.apache.hadoop.mapreduce Job waitForCompletion

List of usage examples for org.apache.hadoop.mapreduce Job waitForCompletion

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job waitForCompletion.

Prototype

public boolean waitForCompletion(boolean verbose)
        throws IOException, InterruptedException, ClassNotFoundException 

Source Link

Document

Submit the job to the cluster and wait for it to finish.

Usage

From source file:com.ifeng.vdn.iparea.parser.IPAreaDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());//  w  w w . j a  va  2 s  . c  o m

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(IPAreaMapper.class);
    job.setReducerClass(IPAreaReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.iparea.parser.IPAreaLocalDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());//from  w  ww.  j  a v  a  2 s  .co m

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(IPAreaMapper.class);
    job.setReducerClass(IPAreaReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:com.ifeng.vdn.loggroup.mapper.VideologGroupDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }/* www  . ja v  a  2s  . co m*/

    Job job = Job.getInstance(super.getConf());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogGroupMapper.class);
    job.setReducerClass(VideologGroupReducer.class);
    job.setCombinerClass(VideologGroupReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.logparser.mapper.VideoLogDriver.java

License:Apache License

@Override
public int run(String[] paths) throws Exception {
    Job job = Job.getInstance(super.getConf());
    job.setJarByClass(getClass());/*from   ww w.  j a v  a 2  s. c  o  m*/

    FileInputFormat.addInputPath(job, new Path(paths[0]));
    FileOutputFormat.setOutputPath(job, new Path(paths[1]));

    job.setMapperClass(VideoLogMapper.class);
    job.setReducerClass(VideoLogReducer.class);
    job.setCombinerClass(VideoLogReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.parser.VideoLogParseDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());//  w  w  w. java2 s .  c  o m

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.ifeng.vdn.parser.VideoLogParseLocalDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(getClass());//from  w  w w.  j a  v a2 s. co  m

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(VideoLogParseMapper.class);
    job.setReducerClass(VideoLogParseReducer.class);
    job.setCombinerClass(VideoLogParseReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:com.ifeng.vdn.videolog.sort.SortGroupResultPreprocessor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }//from  ww  w  .j  av a  2  s . co m

    Job job = Job.getInstance(getConf());
    job.setMapperClass(SortGroupResultMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Sort data by total number:
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.iflytek.spider.crawl.CrawlDb.java

License:Apache License

public void update(Path crawlDb, Path[] segments, boolean additionsAllowed, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: starting");
        LOG.info("CrawlDb update: db: " + crawlDb);
        LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
        LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    }// ww w  .  j  ava 2 s.c  o m

    Job job = CrawlDb.createJob(getConf(), crawlDb);
    job.getConfiguration().setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
        if (fs.exists(fetch)) {
            FileInputFormat.addInputPath(job, fetch);
        }
        if (fs.exists(parse)) {
            FileInputFormat.addInputPath(job, parse);
        } else {
            LOG.info(" - skipping invalid segment " + segments[i]);
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: Merging segment data into db.");
    }
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    } catch (InterruptedException e) {
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    } catch (ClassNotFoundException e) {
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    }

    CrawlDb.install(job, crawlDb);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: done");
    }
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation./*from w w w.  ja va 2s  .c  o m*/
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }//from  w ww.  j av a 2  s. com
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}