org.apache.nutch.crawl.GeneratorJob2.java Source code

Introduction

Here is the source code for org.apache.nutch.crawl.GeneratorJob2.java
Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.QueryOperators;
import com.mongodb.WriteResult;

public class GeneratorJob2 extends NutchTool implements Tool {
    // String constants with dotted, configuration-style names. Within this
    // file only BATCH_ID and GENERATE_COUNT are actually read or written; the
    // remaining keys are presumably consumed by other generator code --
    // TODO confirm against callers.
    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    public static final String GENERATOR_MIN_SCORE = "generate.min.score";
    public static final String GENERATOR_FILTER = "generate.filter";
    public static final String GENERATOR_NORMALISE = "generate.normalise";
    public static final String GENERATOR_MAX_COUNT = "generate.max.count";
    public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
    // presumably the accepted values for GENERATOR_COUNT_MODE -- TODO confirm
    public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
    public static final String GENERATOR_COUNT_VALUE_HOST = "host";
    public static final String GENERATOR_COUNT_VALUE_IP = "ip";
    public static final String GENERATOR_TOP_N = "generate.topN";
    public static final String GENERATOR_CUR_TIME = "generate.curTime";
    public static final String GENERATOR_DELAY = "crawl.gen.delay";
    public static final String GENERATOR_RANDOM_SEED = "generate.partition.seed";
    /** Configuration key holding the batch id; also the key of the batch id in the results map of {@link #run(Map)}. */
    public static final String BATCH_ID = "generate.batch.id";
    /** Results-map key under which {@link #run(Map)} stores the number of generated URLs (as a {@code Long}). */
    public static final String GENERATE_COUNT = "generate.count";

    // MongoDB server address and database name. Both are populated by
    // readGoraConfig() from gora.properties and read by generateBatchId().
    private String goraMongoAddress;
    private String goraMongoDb;

    // Base set of WebPage fields; getFields(Job) copies it and adds the
    // fields reported by the configured fetch schedule.
    private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

    static {
        FIELDS.add(WebPage.Field.FETCH_TIME);
        FIELDS.add(WebPage.Field.SCORE);
        FIELDS.add(WebPage.Field.STATUS);
        FIELDS.add(WebPage.Field.MARKERS);
    }

    /** Logger shared by every method of this class. */
    public static final Logger LOG = LoggerFactory.getLogger(GeneratorJob2.class);

    public static class SelectorEntry implements WritableComparable<SelectorEntry> {

        String url;
        float score;

        public SelectorEntry() {
        }

        public SelectorEntry(String url, float score) {
            this.url = url;
            this.score = score;
        }

        public void readFields(DataInput in) throws IOException {
            url = Text.readString(in);
            score = in.readFloat();
        }

        public void write(DataOutput out) throws IOException {
            Text.writeString(out, url);
            out.writeFloat(score);
        }

        public int compareTo(SelectorEntry se) {
            if (se.score > score)
                return 1;
            else if (se.score == score)
                return url.compareTo(se.url);
            return -1;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + url.hashCode();
            result = prime * result + Float.floatToIntBits(score);
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            SelectorEntry other = (SelectorEntry) obj;
            if (!url.equals(other.url))
                return false;
            if (Float.floatToIntBits(score) != Float.floatToIntBits(other.score))
                return false;
            return true;
        }

        /**
         * Sets url with score on this writable. Allows for writable reusing.
         * 
         * @param url
         * @param score
         */
        public void set(String url, float score) {
            this.url = url;
            this.score = score;
        }
    }

    public static class SelectorEntryComparator extends WritableComparator {
        public SelectorEntryComparator() {
            super(SelectorEntry.class, true);
        }
    }

    static {
        WritableComparator.define(SelectorEntry.class, new SelectorEntryComparator());
    }

    /**
     * Creates a job without a configuration; one has to be supplied
     * separately (for example {@link #main(String[])} hands the instance to
     * {@code ToolRunner} together with a configuration).
     */
    public GeneratorJob2() {
        // intentionally empty
    }

    /**
     * Creates a job that uses the given configuration.
     *
     * @param conf the configuration passed to {@code setConf}
     */
    public GeneratorJob2(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Returns the base field set of this class combined with the fields
     * reported by the fetch schedule configured for the given job.
     *
     * @param job the job whose configuration selects the fetch schedule
     * @return a new collection; the shared base set is not modified
     */
    public Collection<WebPage.Field> getFields(Job job) {
        Set<WebPage.Field> merged = new HashSet<WebPage.Field>(FIELDS);
        Configuration jobConf = job.getConfiguration();
        merged.addAll(FetchScheduleFactory.getFetchSchedule(jobConf).getFields());
        return merged;
    }

    /**
     * Generates a random batch id of the form
     * {@code <epoch seconds>-<non-negative random int>}.
     *
     * @return the new batch id
     */
    public static String randomBatchId() {
        long curTime = System.currentTimeMillis();
        // Clear the sign bit instead of calling Math.abs: Math.abs(Integer.MIN_VALUE)
        // is still negative and would yield an id with a double dash.
        int randomSeed = new Random().nextInt() & Integer.MAX_VALUE;
        return (curTime / 1000) + "-" + randomSeed;
    }

    /**
     * Assigns a batch id to pages in MongoDB and reports the outcome.
     *
     * <p>Only {@code Nutch.ARG_BATCH} is read from {@code args}; when it is
     * absent a random batch id is generated. The batch id is stored in the
     * configuration under {@link #BATCH_ID}.
     *
     * @param args argument map; may carry the batch id under {@code Nutch.ARG_BATCH}
     * @return the results map, holding the batch id under {@link #BATCH_ID}
     *         and the number of marked URLs (a {@code Long}) under
     *         {@link #GENERATE_COUNT}
     * @throws Exception declared by the signature; configuration loading
     *         failures surface as runtime exceptions
     */
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        final long startMillis = System.currentTimeMillis();
        final DateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        LOG.info("GeneratorJob: starting at {}", timestampFormat.format(new Date()));

        // Fall back to a freshly generated id when the caller supplied none.
        final String requestedBatchId = (String) args.get(Nutch.ARG_BATCH);
        final String batchId = (requestedBatchId != null) ? requestedBatchId : randomBatchId();
        getConf().set(BATCH_ID, batchId);

        // Update MongoDB with the batch id; connection settings come from gora.properties.
        readGoraConfig();

        results.put(BATCH_ID, getConf().get(BATCH_ID));
        final long generateCount = generateBatchId(getConf(), batchId);
        results.put(GENERATE_COUNT, generateCount);

        final long endMillis = System.currentTimeMillis();
        final long elapsedSeconds = (endMillis - startMillis) / 1000;
        LOG.info("GeneratorJob: finished at {}, time elapsed: {}", timestampFormat.format(new Date()),
                elapsedSeconds);
        LOG.info("GeneratorJob: generated batch id: {} containing {} URLs", batchId, generateCount);
        return results;
    }

    /**
     * Stamps the given batch id onto pages in the MongoDB {@code webpage}
     * collection and returns how many documents were stamped.
     *
     * <p>Two multi-document updates are issued against the collection
     * {@code [<crawlId>_]webpage}:
     * <ol>
     * <li>every document matching {@code batchId == null} gets
     * {@code batchId} and {@code markers._gnmrk_} set to the batch id;</li>
     * <li>every document whose {@code fetchTime} is later than the current
     * time gets {@code fetchTime} reset to the current time.</li>
     * </ol>
     *
     * <p>The connection uses the address and database name loaded by
     * {@link #readGoraConfig()}.
     *
     * @param conf    configuration supplying the optional crawl id
     *                ({@code Nutch.CRAWL_ID_KEY}) used as collection prefix
     * @param batchId the batch id to write
     * @return the number of documents affected by the first update, or
     *         {@code 0} when any step fails (the failure is logged)
     */
    public int generateBatchId(Configuration conf, String batchId) {
        MongoClient mongoClient = null;
        try {
            mongoClient = new MongoClient(goraMongoAddress);
            DB db = mongoClient.getDB(goraMongoDb);
            String cId = conf.get(Nutch.CRAWL_ID_KEY);
            String collPrefix = "";
            if (org.apache.commons.lang3.StringUtils.isNoneEmpty(cId)) {
                collPrefix = cId + "_";
            }
            String crawlColl = collPrefix + "webpage";
            DBCollection collOps = db.getCollection(crawlColl);

            // Shell equivalent: update({batchId: null}, {$set: {...}}, false, true)
            // i.e. no upsert, apply to all matching documents.
            BasicDBObject q = new BasicDBObject("batchId", null);

            DBObject set = new BasicDBObject("batchId", batchId);
            set.put("markers._gnmrk_", batchId);
            BasicDBObject o = new BasicDBObject("$set", set);
            WriteResult wr = collOps.update(q, o, false, true);

            // Pull every fetchTime that lies in the future back to "now".
            long curTime = System.currentTimeMillis();
            q = new BasicDBObject();
            q.append("fetchTime", new BasicDBObject().append(QueryOperators.GT, curTime));
            o = new BasicDBObject();
            o.append("$set", new BasicDBObject().append("fetchTime", curTime));
            collOps.update(q, o, false, true);

            return wr.getN();
        } catch (Exception e) {
            // Report through the class logger so the failure reaches the job
            // log; 0 keeps the existing "nothing generated" contract for callers.
            LOG.error("GeneratorJob: failed to assign batch id " + batchId + " in MongoDB", e);
            return 0;
        } finally {
            if (mongoClient != null) {
                mongoClient.close();
            }
        }
    }

    /**
     * Loads the MongoDB connection settings from the {@code gora.properties}
     * configuration resource into this instance: {@code gora.mongodb.servers}
     * becomes the server address and {@code gora.mongodb.db} the database name.
     *
     * @throws RuntimeException if the resource cannot be found or read
     */
    public void readGoraConfig() {
        java.io.Reader reader = getConf().getConfResourceAsReader("gora.properties");
        if (reader == null) {
            // Fail with a clear message instead of a NullPointerException
            // from Properties.load(null).
            throw new RuntimeException("Configuration resource gora.properties not found");
        }
        try {
            Properties pros = new Properties();
            pros.load(reader);
            goraMongoAddress = pros.getProperty("gora.mongodb.servers");
            goraMongoDb = pros.getProperty("gora.mongodb.db");
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only resource fails.
            }
        }
    }

    /**
     * Mark URLs ready for fetching.
     *
     * <p>Logs the requested options, delegates to {@link #run(Map)} and
     * reports the outcome. All four parameters are forwarded in the argument
     * map together with the batch id currently stored in the configuration
     * under {@link #BATCH_ID}.
     *
     * @param topN    forwarded as {@code Nutch.ARG_TOPN}; logged unless it is {@code Long.MAX_VALUE}
     * @param curTime forwarded as {@code Nutch.ARG_CURTIME}
     * @param filter  forwarded as {@code Nutch.ARG_FILTER}
     * @param norm    forwarded as {@code Nutch.ARG_NORMALIZE}
     * @return the batch id that was used, or {@code null} when no URL was marked
     * @throws Exception if {@link #run(Map)} fails
     */
    public String generate(long topN, long curTime, boolean filter, boolean norm) throws Exception {

        final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        final long startMillis = System.currentTimeMillis();
        LOG.info("GeneratorJob: starting at " + timestampFormat.format(startMillis));
        LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
        LOG.info("GeneratorJob: starting");
        LOG.info("GeneratorJob: filtering: " + filter);
        LOG.info("GeneratorJob: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("GeneratorJob: topN: " + topN);
        }

        final String configuredBatchId = getConf().get(BATCH_ID);
        final Map<String, Object> jobArgs = ToolUtil.toArgMap(Nutch.ARG_TOPN, topN, Nutch.ARG_CURTIME, curTime,
                Nutch.ARG_FILTER, filter, Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, configuredBatchId);
        final Map<String, Object> outcome = run(jobArgs);

        // When no id was configured, pick up the random one chosen by run(Map).
        final String batchId = (configuredBatchId != null) ? configuredBatchId : (String) outcome.get(BATCH_ID);

        final long finishMillis = System.currentTimeMillis();
        final long generateCount = (Long) outcome.get(GENERATE_COUNT);
        LOG.info("GeneratorJob: finished at " + timestampFormat.format(finishMillis) + ", time elapsed: "
                + TimingUtil.elapsedTime(startMillis, finishMillis));
        LOG.info("GeneratorJob: generated batch id: " + batchId + " containing " + generateCount + " URLs");

        return (generateCount == 0) ? null : batchId;
    }

    /**
     * Command-line entry point: parses the options, then calls
     * {@link #generate(long, long, boolean, boolean)}.
     *
     * <p>At least one argument is required; with none, the usage text is
     * printed. An option that lacks its value or carries a non-numeric value
     * where a number is expected is reported on {@code System.err} instead
     * of surfacing as an exception.
     *
     * @param args the command-line arguments
     * @return {@code 0} when a batch was generated, {@code 1} when no URL
     *         was marked, {@code -1} on usage errors or failures
     */
    public int run(String[] args) throws Exception {
        if (args.length <= 0) {
            printUsage();
            return -1;
        }

        long curTime = System.currentTimeMillis();
        long topN = Long.MAX_VALUE;
        boolean filter = true;
        boolean norm = true;

        try {
            for (int i = 0; i < args.length; i++) {
                String arg = args[i];
                boolean takesValue = "-topN".equals(arg) || "-crawlId".equals(arg)
                        || "-adddays".equals(arg) || "-batchId".equals(arg);
                if (takesValue && i + 1 >= args.length) {
                    // Guard the args[++i] reads below against running off the array.
                    System.err.println("Missing value for arg " + arg);
                    return -1;
                }
                if ("-topN".equals(arg)) {
                    topN = Long.parseLong(args[++i]);
                } else if ("-noFilter".equals(arg)) {
                    filter = false;
                } else if ("-noNorm".equals(arg)) {
                    norm = false;
                } else if ("-crawlId".equals(arg)) {
                    getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
                } else if ("-adddays".equals(arg)) {
                    long numDays = Integer.parseInt(args[++i]);
                    curTime += numDays * 1000L * 60 * 60 * 24;
                } else if ("-batchId".equals(arg)) {
                    getConf().set(BATCH_ID, args[++i]);
                } else {
                    System.err.println("Unrecognized arg " + arg);
                    return -1;
                }
            }
        } catch (NumberFormatException e) {
            System.err.println("Invalid numeric argument: " + e.getMessage());
            return -1;
        }

        try {
            return (generate(topN, curTime, filter, norm) != null) ? 0 : 1;
        } catch (Exception e) {
            LOG.error("GeneratorJob: " + StringUtils.stringifyException(e));
            return -1;
        }
    }

    /** Prints the command-line usage text to {@code System.out}. */
    private static void printUsage() {
        System.out.println(
                "Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays] [-batchId id]");
        System.out.println("    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE ");
        System.out.println(
                "    -crawlId <id>  - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)");
        System.out.println(
                "    -noFilter      - do not activate the filter plugin to filter the url, default is true ");
        System.out.println(
                "    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true ");
        System.out.println(
                "    -adddays       - Adds numDays to the current time to facilitate crawling urls already");
        System.out.println(
                "                     fetched sooner then db.fetch.interval.default. Default value is 0.");
        System.out.println("    -batchId       - the batch id ");
        System.out.println("----------------------");
        System.out.println("Please set the params.");
    }

    /**
     * Runs the tool through {@code ToolRunner} with a freshly created Nutch
     * configuration and exits the JVM with the tool's return code.
     *
     * @param args the command-line arguments, passed on unchanged
     */
    public static void main(String[] args) throws Exception {
        final Configuration conf = NutchConfiguration.create();
        final int exitCode = ToolRunner.run(conf, new GeneratorJob2(), args);
        System.exit(exitCode);
    }

}