com.iflytek.spider.crawl.GeneratorSmart.java Source code

Introduction

Here is the source code for com.iflytek.spider.crawl.GeneratorSmart.java, a Hadoop MapReduce tool that selects URLs due for fetching from the crawl database and partitions them into one or more fetch segments.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.iflytek.spider.crawl;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;

import org.apache.commons.lang.math.RandomUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.iflytek.avro.mapreduce.AvroJob;
import com.iflytek.avro.mapreduce.input.AvroPairInputFormat;
import com.iflytek.avro.mapreduce.output.AvroMapOutputFormat;
import com.iflytek.avro.mapreduce.output.AvroPairOutputFormat;
import com.iflytek.spider.metadata.Spider;
import com.iflytek.spider.util.LockUtil;
import com.iflytek.spider.util.SpiderConfiguration;

/**
 * Generates fetchlists for several segments in one go. Unlike the initial
 * version (OldGenerator), the IP resolution is done ONLY on the entries
 * which have been selected for fetching. The URLs are partitioned by IP,
 * domain or host within a segment. We can choose separately how to count
 * the URLs, i.e. by domain or host, to limit the entries.
 */
public class GeneratorSmart extends Configured implements Tool {

    public static final Log LOG = LogFactory.getLog(GeneratorSmart.class);

    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    public static final String GENERATOR_DELAY = "crawl.gen.delay";
    public static final String GENERATOR_CUR_TIME = "generate.curTime";
    public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
    public static final String GENERATOR_COUNT_PER_SEGMENTS = "generate.count.per.segments";

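    /** A selected (url, datum) pair tagged with the segment number it is assigned to. */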
    public static class SelectorEntry {
        public String url;
        public CrawlDatum datum;
        public int segnum;
    }

    /** Selects entries due for fetch. */
    public static class SelectorMapper extends Mapper<String, CrawlDatum, Float, SelectorEntry> {
        private long genTime = System.currentTimeMillis();
        private long curTime;

        private SelectorEntry entry = new SelectorEntry();
        private long genDelay;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            curTime = context.getConfiguration().getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
            long time = context.getConfiguration().getLong(Spider.GENERATE_TIME_KEY, 0L);
            if (time > 0)
                genTime = time;
            genDelay = context.getConfiguration().getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
        }

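        /**
         * Emits each entry that is due for fetch under a random float key, so
         * that entries reach the reducer in random order before the
         * per-segment limit is applied.
         */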
        @Override
        protected void map(String key, CrawlDatum crawlDatum, Context context)
                throws IOException, InterruptedException {

            if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_FETCHED)
                return;
            if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
                return;
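            // Hard-coded exclusion: skip qidian.com book-reader chapter URLs.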
            if (key.toString().matches("http://www.qidian.com/BookReader/\\d+,0.aspx"))
                return;
            if (crawlDatum.getMetaData().get(Spider.GENERATE_TIME_KEY) != null) {
                String oldgen = crawlDatum.getMetaData().get(Spider.GENERATE_TIME_KEY).toString();

                if (oldgen != null) { // already generated; awaiting fetch & update
                    long oldGenTime = Long.parseLong(oldgen);
                    if (oldGenTime + genDelay > curTime) // still within the update delay
                        return;
                }
                }
            }

            // record generation time
            crawlDatum.setMeta(Spider.GENERATE_TIME_KEY, String.valueOf(genTime));
            entry.datum = crawlDatum;
            entry.url = key;
            context.write(RandomUtils.nextFloat(), entry);
        }
    }

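    /**
     * Assigns each entry a segment number, starting a new segment whenever the
     * per-segment count limit (generate.count.per.segments) is reached.
     */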
    public static class SelectorReducer extends Reducer<Float, SelectorEntry, Float, SelectorEntry> {
        private long count = 0;
        private long limit;
        int currentsegmentnum = 1;
        //private AvroMultipleOutputs<Float, SelectorEntry>   mos;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            limit = context.getConfiguration().getInt(GENERATOR_COUNT_PER_SEGMENTS, 5000);
            //mos = new AvroMultipleOutputs<Float, SelectorEntry>(context);
        }

        @Override
        protected void reduce(Float key, Iterable<SelectorEntry> values, Context context)
                throws IOException, InterruptedException {
            for (SelectorEntry entry : values) {

                if (count >= limit) {
                    // current segment is full: roll over to the next segment number
                    count = 0;
                    currentsegmentnum++;
                }

                entry.segnum = currentsegmentnum;
                //mos.write(key, entry, generateFileNameForKeyValue(key, entry));
                context.write(key, entry);
                count++;
            }
        }

        //      @Override
        //      protected void cleanup(Context context) throws IOException,
        //            InterruptedException {
        //         mos.close();
        //      }
    }

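    /**
     * Inverts the selector output back to (url, entry) pairs and overwrites
     * segnum with a running counter, which AveragePartition uses to spread
     * URLs round-robin across partitions.
     */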
    public static class SelectorInverseMapper extends Mapper<Float, SelectorEntry, String, SelectorEntry> {
        int numUrls = 0;

        @Override
        protected void map(Float key, SelectorEntry value, Context context)
                throws IOException, InterruptedException {
            value.segnum = numUrls;
            numUrls++;
            context.write(value.url, value);
        }

    }

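    /** Distributes entries across reducers round-robin on the counter set by SelectorInverseMapper. */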
    public static class AveragePartition extends Partitioner<String, SelectorEntry> {
        @Override
        public int getPartition(String key, SelectorEntry value, int numPartitions) {

            return value.segnum % numPartitions;
        }
    }

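    /** Unwraps each SelectorEntry back to a bare (url, CrawlDatum) pair for the segment output. */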
    public static class PartitionReducer extends Reducer<String, SelectorEntry, String, CrawlDatum> {

        @Override
        protected void reduce(String key, Iterable<SelectorEntry> values, Context context)
                throws IOException, InterruptedException {
            for (SelectorEntry value : values) {
                context.write(key, value.datum);
            }
        }

    }

    /**
     * Update the CrawlDB so that the next generate won't include the same URLs.
     */
    public static class CrawlDbUpdateMapper extends Mapper<String, CrawlDatum, String, CrawlDatum> {
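        // Identity mapper: the inherited map() passes (url, datum) pairs through unchanged.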
    }

    public static class CrawlDbUpdateReducer extends Reducer<String, CrawlDatum, String, CrawlDatum> {
        long generateTime;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            generateTime = context.getConfiguration().getLong(Spider.GENERATE_TIME_KEY, 0L);
        }

        private CrawlDatum orig = new CrawlDatum();
        private long genTime = 0;

        @Override
        protected void reduce(String key, Iterable<CrawlDatum> values, Context context)
                throws IOException, InterruptedException {
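            // Merge the crawldb copy with the freshly generated copy of each URL:
            // a datum whose generate mark is absent or stale becomes the base record,
            // and the current generation's timestamp is re-applied if one was seen.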
            for (CrawlDatum val : values) {
                if (val.getMetaData().containsKey(Spider.GENERATE_TIME_KEY)) {
                    genTime = Long.parseLong(val.getMetaData().get(Spider.GENERATE_TIME_KEY).toString());
                    if (genTime != generateTime) {
                        orig.set(val);
                        genTime = 0;
                        continue;
                    }
                } else {
                    orig.set(val);
                }
            }
            if (genTime != 0) {
                orig.getMetaData().put(Spider.GENERATE_TIME_KEY, String.valueOf(genTime));
            }
            context.write(key, orig);
        }
    }

    public GeneratorSmart() {
    }

    public GeneratorSmart(Configuration conf) {
        setConf(conf);
    }

    /**
     * Generate fetchlists in one or more segments.
     * 
     * @param dbDir
     *          Crawl database directory
     * @param segments
     *          Segments directory
     * @param numLists
     *          Number of reduce tasks (-1 to take the job's default)
     * @param curTime
     *          Current time in milliseconds
     * @param force
     *          Force creation of the crawldb lock file even if one already exists
     * 
     * @return Paths to the generated segments, or null if no entries were selected
     * 
     * @throws IOException
     *           When an I/O error occurs
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
            throws IOException, InterruptedException, ClassNotFoundException {
        //getConf().set("mapred.temp.dir", "d:/tmp");
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        LOG.info("Generator: Selecting best-scoring urls due for fetch.");
        LOG.info("Generator: starting");

        Job job = AvroJob.getAvroJob(getConf());
        if (numLists == -1) {
            // for politeness, make a partition per fetch task
            numLists = job.getNumReduceTasks();
        }
        if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        LOG.info("Generator: with " + numLists + " partition.");
        job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);

        job.setMapperClass(SelectorMapper.class);
        job.setReducerClass(SelectorReducer.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        //job.setOutputFormatClass(AvroPairOutputFormat.class);
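        // GeneratorOutputFormat presumably writes one "fetchlist-<n>" file per
        // segment number; the "fetchlist-" prefix is matched below when the
        // temp output is turned into segments.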
        job.setOutputFormatClass(GeneratorOutputFormat.class);
        job.setOutputKeyClass(Float.class);
        job.setOutputValueClass(SelectorEntry.class);
        // AvroMultipleOutputs.addNamedOutput(job, "seq",
        // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
        try {
            job.waitForCompletion(true);
        } catch (IOException e) {
            LOG.error("Generator: " + StringUtils.stringifyException(e));
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-"))
                    continue;
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

                fs.createNewFile(new Path(newSeg, "generatored"));
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("Generator: exception while partitioning segments, exiting ...", e);
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("Generator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = AvroJob.getAvroJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormatClass(AvroPairInputFormat.class);
            job.setMapperClass(CrawlDbUpdateMapper.class);
            // job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormatClass(AvroMapOutputFormat.class);
            job.setOutputKeyClass(String.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                job.waitForCompletion(true);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        if (LOG.isInfoEnabled()) {
            LOG.info("Generator: done.");
        }
        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    }

    private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
            throws IOException, InterruptedException, ClassNotFoundException {
        // invert again, partition by host/domain/IP, sort by url hash
        if (LOG.isInfoEnabled()) {
            LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
        }
        Path segment = new Path(segmentsDir, generateSegmentName());
        Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

        LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

        Job job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: partition " + segment);
        job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

        FileInputFormat.addInputPath(job, inputDir);
        job.setInputFormatClass(AvroPairInputFormat.class);

        job.setMapperClass(SelectorInverseMapper.class);
        job.setPartitionerClass(AveragePartition.class);
        job.setMapOutputKeyClass(String.class);
        job.setMapOutputValueClass(SelectorEntry.class);
        job.setReducerClass(PartitionReducer.class);
        job.setNumReduceTasks(numLists);

        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormatClass(AvroPairOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);

        job.waitForCompletion(true);
        return segment;
    }

    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

    public static synchronized String generateSegmentName() {
        try {
            // sleep one second so that consecutive calls produce distinct names
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        return sdf.format(new Date(System.currentTimeMillis()));
    }

    /**
     * Generate a fetchlist from the crawldb.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(SpiderConfiguration.create(), new GeneratorSmart(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("Usage: Generator <crawldb> <segments_dir> [-force] [-numFetchers numFetchers]");
            return -1;
        }

        Path dbDir = new Path(args[0]);
        Path segmentsDir = new Path(args[1]);
        long curTime = System.currentTimeMillis();
        int numFetchers = -1;
        boolean force = false;

        for (int i = 2; i < args.length; i++) {
            if ("-numFetchers".equals(args[i])) {
                numFetchers = Integer.parseInt(args[i + 1]);
                i++;
            } else if ("-force".equals(args[i])) {
                force = true;
            }

        }

        try {
            Path[] segs = generate(dbDir, segmentsDir, numFetchers, curTime, force);
            if (segs == null)
                return -1;
        } catch (Exception e) {
            LOG.fatal("Generator: " + StringUtils.stringifyException(e));
            return -1;
        }
        return 0;
    }
}
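
Usage

A minimal programmatic sketch of driving the generator from another class. The crawl/crawldb and crawl/segments paths, the partition count, and the GenerateDemo class name are illustrative assumptions, not part of the original source:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.iflytek.spider.crawl.GeneratorSmart;
import com.iflytek.spider.util.SpiderConfiguration;

public class GenerateDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = SpiderConfiguration.create();
        GeneratorSmart generator = new GeneratorSmart(conf);
        // Select URLs due for fetch from crawl/crawldb (hypothetical path) and
        // write them into fetch segments under crawl/segments, 4 partitions each.
        Path[] segments = generator.generate(new Path("crawl/crawldb"),
                new Path("crawl/segments"), 4, System.currentTimeMillis(), false);
        if (segments == null) {
            System.out.println("No entries selected for fetching.");
        } else {
            for (Path segment : segments) {
                System.out.println("Generated segment: " + segment);
            }
        }
    }
}

The same run from the command line, assuming the classes are packaged into a runnable jar (spider.jar is a hypothetical name):

hadoop jar spider.jar com.iflytek.spider.crawl.GeneratorSmart crawl/crawldb crawl/segments -numFetchers 4 -force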