org.apache.nutch.crawl.DbUpdaterAllJob.java Source code

Introduction

Here is the source code for org.apache.nutch.crawl.DbUpdaterAllJob.java
Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.crawl;

import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;

import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.UrlWithScore.UrlOnlyPartitioner;
import org.apache.nutch.crawl.UrlWithScore.UrlScoreComparator;
import org.apache.nutch.crawl.UrlWithScore.UrlScoreComparator.UrlOnlyComparator;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DbUpdaterAllJob extends NutchTool implements Tool {

    public static final Logger LOG = LoggerFactory.getLogger(DbUpdaterAllJob.class);

    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

    static {
        FIELDS.add(WebPage.Field.SCORE);
        FIELDS.add(WebPage.Field.STATUS);
        FIELDS.add(WebPage.Field.OUTLINKS);
        FIELDS.add(WebPage.Field.INLINKS);
        FIELDS.add(WebPage.Field.STATUS);
        FIELDS.add(WebPage.Field.FETCH_INTERVAL);
        FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
        FIELDS.add(WebPage.Field.RETRIES_SINCE_FETCH);
        FIELDS.add(WebPage.Field.PREV_SIGNATURE);
        FIELDS.add(WebPage.Field.SIGNATURE);
        FIELDS.add(WebPage.Field.FETCH_TIME);
        FIELDS.add(WebPage.Field.MODIFIED_TIME);
        FIELDS.add(WebPage.Field.MARKERS);
        FIELDS.add(WebPage.Field.METADATA);
        FIELDS.add(WebPage.Field.PARSE_STATUS);

        FIELDS.add(WebPage.Field.CONFIGURL);
        FIELDS.add(WebPage.Field.CRAWLTYPE);
    }

    public static final Utf8 DISTANCE = new Utf8("dist");

    public DbUpdaterAllJob() {

    }

    public DbUpdaterAllJob(Configuration conf) {
        setConf(conf);
    }

    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        String crawlId = (String) args.get(Nutch.ARG_CRAWL);
        numJobs = 1;
        currentJobNum = 0;
        currentJob = new NutchJob(getConf(), "updateAll-table:" + (this.getConf().get(NutchConstant.BATCH_ID_KEY)));
        if (crawlId != null) {
            currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
        }
        // job.setBoolean(ALL, updateAll);
        ScoringFilters scoringFilters = new ScoringFilters(getConf());
        HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
        fields.addAll(scoringFilters.getFields());

        // Partition by {url}, sort by {url,score} and group by {url}.
        // This ensures that the inlinks are sorted by score when they enter
        // the reducer.
        currentJob.getConfiguration().set(NutchConstant.STEPZKBATCHTIME, new Date().toLocaleString());
        currentJob.setPartitionerClass(UrlOnlyPartitioner.class);
        currentJob.setSortComparatorClass(UrlScoreComparator.class);
        currentJob.setGroupingComparatorClass(UrlOnlyComparator.class);
        String batchZKId = this.getConf().get(NutchConstant.BATCH_ID_KEY);
        NutchConstant.setUrlConfig(currentJob.getConfiguration(), 2);
        LOG.info("DbUpdaterAllJob: batchId: " + batchZKId + " batchTime:"
                + NutchConstant.getBatchTime(currentJob.getConfiguration()));
        LOG.info("DbUpdaterAllJob  batchId: " + batchZKId);
        StorageUtils.initMapperJob(currentJob, fields, WebPage.class, UrlWithScore.class, NutchWritable.class,
                DbUpdateAllMapper.class);
        StorageUtils.initReducerJob(currentJob, WebPage.class, DbUpdateAllReducer.class);
        currentJob.waitForCompletion(true);
        ToolUtil.recordJobStatus(null, currentJob, results);
        return results;
    }

    private int updateTable(String crawlId) throws Exception {
        try {
            LOG.info("DbUpdaterJob: starting");
            Object res = run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId));
            LOG.info("DbUpdaterJob: done");
            if (res == null)
                return -1;
        } catch (Exception e) {
            if (NutchConstant.exitValue != 0) {
                return NutchConstant.exitValue;
            } else {
                throw e;
            }
        }
        return 0;
    }

    public int run(String[] args) throws Exception {
        String usage = "Usage: DbUpdaterJob  <-batch batchId> [-crawlId <id>] ";
        if (args.length == 0) {
            System.err.println(usage);
            return -1;
        }
        String crawlId = null;
        for (int i = 0; i < args.length; i++) {
            if ("-batch".equals(args[i])) {
                String batchId1 = org.apache.commons.lang.StringUtils.lowerCase(args[i + 1]);
                if (batchId1 != null && !batchId1.equals("")) {
                    getConf().set(NutchConstant.BATCH_ID_KEY, batchId1);
                }
                i++;
            } else if ("-crawlId".equals(args[i])) {
                getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
            } else if ("-batch".equals(args[i])) {
                String batchId1 = org.apache.commons.lang.StringUtils.lowerCase(args[i + 1]);
                if (batchId1 != null && !batchId1.equals("")) {
                    getConf().set(NutchConstant.BATCH_ID_KEY, batchId1);
                }
                i++;
            } else {
                throw new IllegalArgumentException("usage: " + "<-batch batchId> (-crawlId <id>) ");
            }
        }
        return updateTable(crawlId);
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new DbUpdaterAllJob(), args);
        System.exit(res);
    }

}