net.peacesoft.nutch.crawl.ReIndexerMapReduce.java Source code


Introduction

Here is the source code for net.peacesoft.nutch.crawl.ReIndexerMapReduce.java. The class is a custom variant of Nutch's IndexerMapReduce: it joins segment data (fetch and parse output), an extra input CrawlDb, the main CrawlDb, and an optional LinkDb per URL, and emits NutchIndexAction records (ADD or DELETE) for the indexing back end.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.peacesoft.nutch.crawl;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.indexer.IndexerMapReduce;
import org.apache.nutch.indexer.IndexerOutputFormat;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexAction;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReIndexerMapReduce extends Configured implements Mapper<Text, Writable, Text, NutchWritable>,
        Reducer<Text, NutchWritable, Text, NutchIndexAction> {

    public static final Logger LOG = LoggerFactory.getLogger(ReIndexerMapReduce.class);
    public static final String INDEXER_DELETE = "indexer.delete";
    public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
    public static final String URL_FILTERING = "indexer.url.filters";
    public static final String URL_NORMALIZING = "indexer.url.normalizers";
    private boolean skip = false;
    private boolean delete = false;
    private IndexingFilters filters;
    private ScoringFilters scfilters;
    // using normalizers and/or filters
    private boolean normalize = false;
    private boolean filter = false;
    // url normalizers, filters and job configuration
    private URLNormalizers urlNormalizers;
    private URLFilters urlFilters;

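    /**
     * Reads the job switches (delete, skip-not-modified, URL filtering and
     * normalization) from the JobConf and sets up the indexing and scoring
     * filters used by the reducer.
     */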
    public void configure(JobConf job) {
        setConf(job);
        this.filters = new IndexingFilters(getConf());
        this.scfilters = new ScoringFilters(getConf());
        this.delete = job.getBoolean(INDEXER_DELETE, false);
        this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);

        normalize = job.getBoolean(URL_NORMALIZING, false);
        filter = job.getBoolean(URL_FILTERING, false);

        if (normalize) {
            urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
        }

        if (filter) {
            urlFilters = new URLFilters(getConf());
        }
    }

    /**
     * Normalizes and trims extra whitespace from the given url.
     *
     * @param url The url to normalize.
     *
     * @return The normalized url.
     */
    private String normalizeUrl(String url) {
        if (!normalize) {
            return url;
        }

        String normalized = null;
        if (urlNormalizers != null) {
            try {

                // normalize and trim the url
                normalized = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INDEXER);
                normalized = normalized.trim();
            } catch (Exception e) {
                LOG.warn("Skipping " + url + ": " + e);
                normalized = null;
            }
        }

        return normalized;
    }

    /**
     * Filters the given url.
     *
     * @param url The url to filter.
     *
     * @return The filtered url or null.
     */
    private String filterUrl(String url) {
        if (!filter) {
            return url;
        }

        try {
            url = urlFilters.filter(url);
        } catch (Exception e) {
            url = null;
        }

        return url;
    }

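    /**
     * Normalizes and filters the URL key, then wraps the segment value in a
     * NutchWritable so that the heterogeneous records for one URL meet in the
     * reducer.
     */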
    public void map(Text key, Writable value, OutputCollector<Text, NutchWritable> output, Reporter reporter)
            throws IOException {

        String urlString = filterUrl(normalizeUrl(key.toString()));
        if (urlString == null) {
            return;
        } else {
            key.set(urlString);
        }

        output.collect(key, new NutchWritable(value));
    }

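    /**
     * Collects the CrawlDb datum, fetch datum, parse data/text and inlinks for
     * a URL, optionally emits a DELETE action for gone pages and permanent
     * redirects, runs the indexing and scoring filters, and emits an ADD action
     * carrying the resulting NutchDocument.
     */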
    public void reduce(Text key, Iterator<NutchWritable> values, OutputCollector<Text, NutchIndexAction> output,
            Reporter reporter) throws IOException {
        Inlinks inlinks = null;
        CrawlDatum dbDatum = null;
        CrawlDatum fetchDatum = null;
        ParseData parseData = null;
        ParseText parseText = null;

        while (values.hasNext()) {
            final Writable value = values.next().get(); // unwrap
            if (value instanceof Inlinks) {
                inlinks = (Inlinks) value;
            } else if (value instanceof CrawlDatum) {
                final CrawlDatum datum = (CrawlDatum) value;
                if (CrawlDatum.hasDbStatus(datum)) {
                    dbDatum = datum;

                    // Whether to skip DB_NOTMODIFIED pages
                    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                        reporter.incrCounter("IndexerStatus", "Skipped", 1);
                        return;
                    }
                } else if (CrawlDatum.hasFetchStatus(datum)) {

                    // don't index unmodified (empty) pages
                    if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
                        fetchDatum = datum;

                        // Optionally delete pages that came back 404 (gone) or
                        // as 301 permanent redirects.
                        if (delete) {
                            if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
                                reporter.incrCounter("IndexerStatus", "Documents deleted", 1);

                                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
                                output.collect(key, action);
                                return;
                            }
                            if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
                                reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);

                                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
                                output.collect(key, action);
                                return;
                            }
                        }
                    }

                } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()
                        || CrawlDatum.STATUS_SIGNATURE == datum.getStatus()
                        || CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
                    continue;
                } else {
                    throw new RuntimeException("Unexpected status: " + datum.getStatus());
                }
            } else if (value instanceof ParseData) {
                parseData = (ParseData) value;
            } else if (value instanceof ParseText) {
                parseText = (ParseText) value;
            } else if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + value.getClass());
            }
        }

        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
            return; // only have inlinks
        }

        if (!parseData.getStatus().isSuccess() || fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
            return;
        }

        NutchDocument doc = new NutchDocument();
        final Metadata metadata = parseData.getContentMeta();

        // add segment, used to map from merged index back to segment files
        doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

        // add digest, used by dedup
        doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

        final Parse parse = new ParseImpl(parseText, parseData);
        try {
            // extract information from dbDatum and pass it to
            // fetchDatum so that indexing filters can use it
            final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (url != null) {
                fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
            }
            // run indexing filters
            doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
        } catch (final IndexingException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error indexing " + key + ": " + e);
            }
            reporter.incrCounter("IndexerStatus", "Errors", 1);
            return;
        }

        // skip documents discarded by indexing filters
        if (doc == null) {
            reporter.incrCounter("IndexerStatus", "Skipped by filters", 1);
            return;
        }

        float boost = 1.0f;
        // run scoring filters
        try {
            boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, boost);
        } catch (final ScoringFilterException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error calculating score " + key + ": " + e);
            }
            return;
        }
        // apply boost to all indexed fields.
        doc.setWeight(boost);
        // store boost for use by explain and dedup
        doc.add("boost", Float.toString(boost));

        reporter.incrCounter("IndexerStatus", "Documents added", 1);

        NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
        output.collect(key, action);
    }

    public void close() throws IOException {
    }

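    /**
     * Adds the segment subdirectories, the extra input CrawlDb, the main
     * CrawlDb and the optional LinkDb as job inputs, and configures this class
     * as mapper and reducer with IndexerOutputFormat as the output format.
     */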
    public static void initMRJob(Path inputDb, Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {

        LOG.info("ReIndexerMapReduce: crawldb: " + crawlDb);

        if (linkDb != null) {
            LOG.info("ReIndexerMapReduce: linkdb: " + linkDb);
        }

        for (final Path segment : segments) {
            LOG.info("ReIndexerMapReduce: adding segment: " + segment);
            FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
            FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
            FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
        }

        FileInputFormat.addInputPath(job, new Path(inputDb, CrawlDb.CURRENT_NAME));
        FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

        if (linkDb != null) {
            FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));
        }

        job.setInputFormat(SequenceFileInputFormat.class);

        // use this class (not the stock IndexerMapReduce) as mapper and reducer
        job.setMapperClass(ReIndexerMapReduce.class);
        job.setReducerClass(ReIndexerMapReduce.class);

        job.setOutputFormat(IndexerOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NutchWritable.class);
        job.setOutputValueClass(NutchIndexAction.class); // the reducer emits NutchIndexAction values
    }
}
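
Example usage

For context, here is a minimal driver sketch showing one way the initMRJob helper above might be wired into a Hadoop job. The driver class name (ReIndexerDriver) and all of the paths are hypothetical assumptions, not part of the original source; the two setBoolean calls simply toggle the switches read in configure().

package net.peacesoft.nutch.crawl;

import java.util.Arrays;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

public class ReIndexerDriver {

    public static void main(String[] args) throws Exception {
        // Hypothetical paths; adjust to the layout of your crawl directory.
        Path inputDb = new Path("crawl/inputdb");
        Path crawlDb = new Path("crawl/crawldb");
        Path linkDb = new Path("crawl/linkdb");
        Path segment = new Path("crawl/segments/20130101000000");

        JobConf job = new NutchJob(NutchConfiguration.create());
        job.setJobName("re-index " + crawlDb);

        // Optional switches read in ReIndexerMapReduce.configure().
        job.setBoolean(ReIndexerMapReduce.INDEXER_DELETE, true);
        job.setBoolean(ReIndexerMapReduce.INDEXER_SKIP_NOTMODIFIED, true);

        // Wire up the inputs, mapper/reducer and output format, then run.
        ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, Arrays.asList(segment), job);
        JobClient.runJob(job);
    }
}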