org.apache.nutch.searcher.NutchBean.java Source code

Introduction

Here is the source code for org.apache.nutch.searcher.NutchBean.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.searcher;

import java.io.*;
import java.net.InetSocketAddress;
import java.util.*;

import javax.servlet.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.parse.*;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.util.NutchConfiguration;

/**
 * One stop shopping for search-related functionality.
 * 
 * @version $Id: NutchBean.java 998587 2010-09-19 04:47:40Z mattmann $
 */
public class NutchBean implements SearchBean, RPCSearchBean, SegmentBean, RPCSegmentBean, HitInlinks, Closeable {

    /** Logger shared by this class and its nested servlet listener. */
    public static final Log LOG = LogFactory.getLog(NutchBean.class);
    /** Servlet-context attribute name under which the bean instance is stored. */
    public static final String KEY = "nutchBean";

    // static {
    // LogFormatter.setShowThreadIDs(true);
    // }

    /** Backend answering search, explanation and hit-detail requests. */
    private SearchBean searchBean;
    /** Backend answering summary, content, parse and fetch-date requests. */
    private SegmentBean segmentBean;
    /** Source of anchors and inlinks for a hit. */
    private final HitInlinks linkDb;

    /**
     * BooleanQuery won't permit more than 32 required/prohibited clauses. We
     * don't want to use too many of those.
     */
    private static final int MAX_PROHIBITED_TERMS = 20;

    // don't let the optimize fall into an infinite loop
    private static final int MAX_OPTIMIZE_LOOPS = 3;

    /** Configuration this bean was constructed with. */
    private final Configuration conf;

    /** File system obtained from {@link #conf}; closed together with the bean. */
    private final FileSystem fs;

    /**
     * Looks up the bean stored in the servlet context under {@link #KEY}.
     * 
     * @param app
     *            servlet context holding the attribute
     * @param conf
     *            not consulted by this method
     * @return whatever is stored under {@link #KEY}, cast to a NutchBean
     * @throws IOException
     *             declared for callers; the lookup itself does no I/O
     * @see NutchBeanConstructor
     */
    public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
        return (NutchBean) app.getAttribute(KEY);
    }

    /**
     * Builds a bean without an explicit directory; the two-argument
     * constructor then falls back to the configured <code>searcher.dir</code>.
     * 
     * @param conf
     *            configuration to build the bean from
     * @throws IOException
     *             if the two-argument constructor fails
     */
    public NutchBean(Configuration conf) throws IOException {
        this(conf, (Path) null);
    }

    /**
     * Builds a bean rooted at the given directory.
     * <p>
     * The presence of <code>search-servers.txt</code>,
     * <code>solr-servers.txt</code> and <code>segment-servers.txt</code>
     * below that directory decides whether distributed or local backends are
     * created.
     * 
     * @param conf
     *            configuration to build the bean from
     * @param dir
     *            base directory; when <code>null</code> the value of
     *            <code>searcher.dir</code> (default <code>crawl</code>) is used
     * @throws IOException
     *             if the file system or one of the backends cannot be set up
     */
    public NutchBean(Configuration conf, Path dir) throws IOException {
        this.conf = conf;
        this.fs = FileSystem.get(this.conf);

        final Path baseDir;
        if (dir != null) {
            baseDir = dir;
        } else {
            baseDir = new Path(this.conf.get("searcher.dir", "crawl"));
        }

        final Path luceneServers = new Path(baseDir, "search-servers.txt");
        final Path solrServers = new Path(baseDir, "solr-servers.txt");
        final Path segmentServers = new Path(baseDir, "segment-servers.txt");

        // search backend: distributed when either server list is present
        final boolean remoteSearch = fs.exists(luceneServers) || fs.exists(solrServers);
        if (remoteSearch) {
            searchBean = new DistributedSearchBean(conf, luceneServers, solrServers);
        } else {
            searchBean = new LuceneSearchBean(conf, new Path(baseDir, "index"), new Path(baseDir, "indexes"));
        }

        // segment backend: dedicated server list first, then the lucene
        // server list, otherwise the local segments directory
        Path remoteSegments = null;
        if (fs.exists(segmentServers)) {
            remoteSegments = segmentServers;
        } else if (fs.exists(luceneServers)) {
            remoteSegments = luceneServers;
        }
        if (remoteSegments != null) {
            segmentBean = new DistributedSegmentBean(conf, remoteSegments);
        } else {
            segmentBean = new FetchedSegments(conf, new Path(baseDir, "segments"));
        }

        linkDb = new LinkDbInlinks(fs, new Path(baseDir, "linkdb"), conf);
    }

    /**
     * Reads a server list and turns each line into a socket address.
     * <p>
     * A line contributes an address only when it has at least two
     * whitespace-separated tokens: the first is the host, the second the port.
     * Any further tokens are ignored.
     * 
     * @param path
     *            file to read
     * @param conf
     *            configuration passed on to {@link #readConfig(Path, Configuration)}
     * @return the parsed addresses, in file order
     * @throws IOException
     *             if the file cannot be read
     */
    public static List<InetSocketAddress> readAddresses(Path path, Configuration conf) throws IOException {
        final List<InetSocketAddress> result = new ArrayList<InetSocketAddress>();
        for (final String entry : readConfig(path, conf)) {
            final StringTokenizer fields = new StringTokenizer(entry);
            if (fields.countTokens() < 2) {
                continue; // host and port are both required
            }
            final String host = fields.nextToken();
            final int port = Integer.parseInt(fields.nextToken());
            result.add(new InetSocketAddress(host, port));
        }
        return result;
    }

    /**
     * Reads a text file from the configured file system, one list entry per
     * line.
     * <p>
     * The file is decoded as UTF-8 so that the result does not depend on the
     * default charset of the JVM that happens to read it.
     * 
     * @param path
     *            file to read
     * @param conf
     *            configuration used to obtain the file system
     * @return all lines of the file, in order, without line terminators
     * @throws IOException
     *             if the file cannot be opened or read
     */
    public static List<String> readConfig(Path path, Configuration conf) throws IOException {
        final FileSystem fs = FileSystem.get(conf);
        final BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path), "UTF-8"));
        try {
            final List<String> lines = new ArrayList<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            return lines;
        } finally {
            // closing the reader also closes the underlying stream
            reader.close();
        }
    }

    /**
     * Returns the segment names reported by the segment backend.
     * 
     * @throws IOException
     *             if the backend fails
     */
    public String[] getSegmentNames() throws IOException {
        final String[] names = segmentBean.getSegmentNames();
        return names;
    }

    /**
     * Searches with the given hit count, no dedup field, no sort field and
     * no reversed ordering.
     * 
     * @deprecated since 1.1, use {@link #search(Query)} instead
     */
    public Hits search(Query query, int numHits) throws IOException {
        final String noDedupField = null;
        final String noSortField = null;
        return search(query, numHits, noDedupField, noSortField, false);
    }

    /**
     * Re-initialises the query's parameters from the arguments (using the
     * default maximum hits per duplicate) and runs {@link #search(Query)}.
     * 
     * @deprecated since 1.1, use {@link #search(Query)} instead
     */
    public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse)
            throws IOException {
        final QueryParams params = query.getParams();
        params.initFrom(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse);
        return search(query);
    }

    /**
     * Runs the query and, unless duplicate limiting is disabled, keeps at
     * most <code>maxHitsPerDup</code> hits per dedup value.
     * <p>
     * When the scan reaches the last raw hit and fewer than
     * {@link #MAX_OPTIMIZE_LOOPS} re-searches have been done, the query is
     * cloned, up to {@link #MAX_PROHIBITED_TERMS} already-excluded dedup
     * values are added as prohibited terms, and the scan restarts on the new
     * raw hits. Hits already handled are remembered across restarts.
     * 
     * @param query
     *            query whose params supply numHits, maxHitsPerDup and the
     *            dedup field
     * @return the collected hits; the reported total is the one from the
     *         first backend search, and <code>totalIsExact</code> is false
     *         once any hit has been dropped as an excess duplicate
     * @throws IOException
     *             if the search backend fails
     */
    public Hits search(Query query) throws IOException {
        if (query.getParams().getMaxHitsPerDup() <= 0) // disable dup checking
            return searchBean.search(query);

        final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
        // NOTE(review): numHitsRaw is only written to the log in this method;
        // it is never handed to searchBean, which receives the query as is.
        int numHitsRaw = (int) (query.getParams().getNumHits() * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
            LOG.info("searching for " + numHitsRaw + " raw hits");
        }
        Hits hits = searchBean.search(query);
        // total comes from this first search and is not refreshed by re-searches
        final long total = hits.getTotal();
        // per dedup value: the hits collected so far for that value
        final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
        final List<Hit> resultList = new ArrayList<Hit>();
        // hits already handled; survives re-searches so nothing is counted twice
        final Set<Hit> seen = new HashSet<Hit>();
        // dedup values that hit their limit, candidates for prohibited terms
        final List<String> excludedValues = new ArrayList<String>();
        boolean totalIsExact = true;
        int optimizeNum = 0;

        for (int rawHitNum = 0; rawHitNum < hits.getLength(); rawHitNum++) {
            // reached the last raw hit with re-searches still allowed: search
            // again instead of processing it (the hit itself is skipped here)
            if (rawHitNum == (hits.getLength() - 1) && (optimizeNum < MAX_OPTIMIZE_LOOPS)) {

                // count this re-search against the loop limit
                optimizeNum++;

                // optimize query by prohibiting more matches on some excluded
                // values
                final Query optQuery = (Query) query.clone();
                for (int i = 0; i < excludedValues.size(); i++) {
                    if (i == MAX_PROHIBITED_TERMS)
                        break;
                    optQuery.addProhibitedTerm(excludedValues.get(i), query.getParams().getDedupField());
                }
                numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
                if (LOG.isInfoEnabled()) {
                    LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
                }
                hits = searchBean.search(optQuery);
                if (LOG.isInfoEnabled()) {
                    LOG.info("found " + hits.getTotal() + " raw hits");
                }
                // the loop increment brings this back to 0: rescan from the start
                rawHitNum = -1;
                continue;
            }

            // get the next raw hit, skipping any already handled
            final Hit hit = hits.getHit(rawHitNum);
            if (seen.contains(hit))
                continue;
            seen.add(hit);

            // get dup hits for its value
            final String value = hit.getDedupValue();
            DupHits dupHits = dupToHits.get(value);
            if (dupHits == null)
                dupToHits.put(value, dupHits = new DupHits());

            // does this hit exceed maxHitsPerDup?
            if (dupHits.size() == query.getParams().getMaxHitsPerDup()) { // yes -- ignore the hit
                if (!dupHits.maxSizeExceeded) {

                    // mark prior hits with moreFromDupExcluded
                    for (int i = 0; i < dupHits.size(); i++) {
                        dupHits.get(i).setMoreFromDupExcluded(true);
                    }
                    dupHits.maxSizeExceeded = true;

                    excludedValues.add(value); // exclude dup
                }
                totalIsExact = false;
            } else { // no -- collect the hit
                resultList.add(hit);
                dupHits.add(hit);

                // are we done?
                // we need to find one more than asked for, so that we can tell
                // if there are more hits to be shown
                if (resultList.size() > query.getParams().getNumHits())
                    break;
            }
        }

        final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()]));
        results.setTotalIsExact(totalIsExact);
        return results;
    }

    @SuppressWarnings("serial")
    private class DupHits extends ArrayList<Hit> {
        private boolean maxSizeExceeded;
    }

    /**
     * Searches while limiting how many hits a single site may contribute.
     * Only the first <code>maxHitsPerDup</code> hits of each site are kept;
     * the kept hits of a trimmed site have
     * {@link Hit#moreFromDupExcluded()} set.
     * <p>
     * A <code>maxHitsPerDup</code> of zero returns all hits.
     * 
     * @param query
     *            query
     * @param numHits
     *            number of requested hits
     * @param maxHitsPerDup
     *            the maximum hits returned with matching values, or zero
     * @return Hits the matching hits
     * @throws IOException
     * @deprecated since 1.1, use {@link #search(Query)} instead
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException {
        final String dedupField = "site";
        final String noSortField = null;
        return search(query, numHits, maxHitsPerDup, dedupField, noSortField, false);
    }

    /**
     * Searches while limiting how many hits may share the same value of a
     * named field. Only the first <code>maxHitsPerDup</code> hits per value
     * are kept; the kept hits of a trimmed value have
     * {@link Hit#moreFromDupExcluded()} set. No sort field is applied.
     * <p>
     * A <code>maxHitsPerDup</code> of zero returns all hits.
     * 
     * @param query
     *            query
     * @param numHits
     *            number of requested hits
     * @param maxHitsPerDup
     *            the maximum hits returned with matching values, or zero
     * @param dedupField
     *            field name to check for duplicates
     * @return Hits the matching hits
     * @throws IOException
     * @deprecated since 1.1, use {@link #search(Query)} instead
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException {
        final String noSortField = null;
        final boolean reverse = false;
        return search(query, numHits, maxHitsPerDup, dedupField, noSortField, reverse);
    }

    /**
     * Replaces the query's parameters with ones built from the arguments and
     * runs {@link #search(Query)}. Only the first <code>maxHitsPerDup</code>
     * hits per dedup value are kept; the kept hits of a trimmed value have
     * {@link Hit#moreFromDupExcluded()} set.
     * <p>
     * A <code>maxHitsPerDup</code> of zero returns all hits.
     * 
     * @param query
     *            query
     * @param numHits
     *            number of requested hits
     * @param maxHitsPerDup
     *            the maximum hits returned with matching values, or zero
     * @param dedupField
     *            field name to check for duplicates
     * @param sortField
     *            Field to sort on (or null if no sorting).
     * @param reverse
     *            True if we are to reverse sort by <code>sortField</code>.
     * @return Hits the matching hits
     * @throws IOException
     * @deprecated since 1.1, use {@link #search(Query)} instead
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField,
            boolean reverse) throws IOException {
        final QueryParams params = new QueryParams(numHits, maxHitsPerDup, dedupField, sortField, reverse);
        query.setParams(params);
        return search(query);
    }

    /** Asks the search backend to explain the given hit for the query. */
    public String getExplanation(Query query, Hit hit) throws IOException {
        final String explanation = searchBean.getExplanation(query, hit);
        return explanation;
    }

    /** Fetches the details of a single hit from the search backend. */
    public HitDetails getDetails(Hit hit) throws IOException {
        final HitDetails details = searchBean.getDetails(hit);
        return details;
    }

    /** Fetches the details of several hits from the search backend. */
    public HitDetails[] getDetails(Hit[] hits) throws IOException {
        final HitDetails[] details = searchBean.getDetails(hits);
        return details;
    }

    /** Obtains the summary of one hit for the query from the segment backend. */
    public Summary getSummary(HitDetails hit, Query query) throws IOException {
        final Summary summary = segmentBean.getSummary(hit, query);
        return summary;
    }

    /** Obtains the summaries of several hits for the query from the segment backend. */
    public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException {
        final Summary[] summaries = segmentBean.getSummary(hits, query);
        return summaries;
    }

    /** Obtains the content bytes of the hit from the segment backend. */
    public byte[] getContent(HitDetails hit) throws IOException {
        final byte[] content = segmentBean.getContent(hit);
        return content;
    }

    /** Obtains the parse data of the hit from the segment backend. */
    public ParseData getParseData(HitDetails hit) throws IOException {
        final ParseData parseData = segmentBean.getParseData(hit);
        return parseData;
    }

    /** Obtains the parse text of the hit from the segment backend. */
    public ParseText getParseText(HitDetails hit) throws IOException {
        final ParseText parseText = segmentBean.getParseText(hit);
        return parseText;
    }

    /** Obtains the anchors of the hit from the link database. */
    public String[] getAnchors(HitDetails hit) throws IOException {
        final String[] anchors = linkDb.getAnchors(hit);
        return anchors;
    }

    /** Obtains the inlinks of the hit from the link database. */
    public Inlinks getInlinks(HitDetails hit) throws IOException {
        final Inlinks inlinks = linkDb.getInlinks(hit);
        return inlinks;
    }

    /** Obtains the fetch date of the hit from the segment backend. */
    public long getFetchDate(HitDetails hit) throws IOException {
        final long fetchDate = segmentBean.getFetchDate(hit);
        return fetchDate;
    }

    /**
     * Closes the search backend, the segment backend, the link database and
     * the file system, in that order.
     * <p>
     * Each close is attempted even if an earlier one threw, so one failing
     * backend no longer leaves the others open. If several closes fail, the
     * exception from the last failing one is the one that propagates.
     * 
     * @throws IOException
     *             if any of the resources fails to close
     */
    public void close() throws IOException {
        try {
            if (searchBean != null) {
                searchBean.close();
            }
        } finally {
            try {
                if (segmentBean != null) {
                    segmentBean.close();
                }
            } finally {
                try {
                    if (linkDb != null) {
                        linkDb.close();
                    }
                } finally {
                    // NOTE(review): fs was obtained via FileSystem.get(conf); if
                    // that hands out a shared instance, closing it here also
                    // affects other holders -- confirm this is intended.
                    if (fs != null) {
                        fs.close();
                    }
                }
            }
        }
    }

    /**
     * Liveness check.
     * 
     * @return always <code>true</code>
     */
    public boolean ping() {
        final boolean alive = true;
        return alive;
    }

    /**
     * For debugging: runs one query with duplicate limiting disabled and
     * prints the total plus details and summaries of at most the first ten
     * hits.
     * <p>
     * Usage: <code>NutchBean query [&lt;searcher.dir&gt;]</code>. Exits with
     * -1 when no query is given. A failure while searching is printed to
     * standard error and the process still exits with status 0.
     * 
     * @param args
     *            the query and, optionally, the searcher directory
     * @throws Exception
     *             if the configuration or the bean cannot be created
     */
    public static void main(String[] args) throws Exception {
        final String usage = "NutchBean query [<searcher.dir>]";

        if (args.length == 0) {
            System.err.println(usage);
            System.exit(-1);
        }

        final Configuration conf = NutchConfiguration.create();
        if (args.length > 1) {
            conf.set("searcher.dir", args[1]);
        }
        final NutchBean bean = new NutchBean(conf);
        try {
            final Query query = Query.parse(args[0], conf);
            query.getParams().setMaxHitsPerDup(0);
            final Hits hits = bean.search(query);
            System.out.println("Total hits: " + hits.getTotal());
            final int length = (int) Math.min(hits.getLength(), 10);
            final Hit[] show = hits.getHits(0, length);
            final HitDetails[] details = bean.getDetails(show);
            final Summary[] summaries = bean.getSummary(details, query);

            // details and summaries were fetched for the first `length` hits
            // only, so the loop must stop there rather than at hits.getLength()
            for (int i = 0; i < length; i++) {
                System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(0);
    }

    /**
     * Forwards the protocol-version request to the backend that matches the
     * requested interface name.
     * 
     * @param className
     *            name of the RPC interface being asked about
     * @param clientVersion
     *            version passed through to the backend
     * @return the version reported by the matching backend
     * @throws IOException
     *             if the name matches neither RPC interface, or the
     *             corresponding backend does not implement that interface
     */
    public long getProtocolVersion(String className, long clientVersion) throws IOException {
        final boolean wantsSearch = RPCSearchBean.class.getName().equals(className);
        if (wantsSearch && searchBean instanceof RPCSearchBean) {
            return ((RPCSearchBean) searchBean).getProtocolVersion(className, clientVersion);
        }

        final boolean wantsSegment = RPCSegmentBean.class.getName().equals(className);
        if (wantsSegment && segmentBean instanceof RPCSegmentBean) {
            return ((RPCSegmentBean) segmentBean).getProtocolVersion(className, clientVersion);
        }

        throw new IOException("Unknown Protocol classname:" + className);
    }

    /**
     * Servlet-context listener that creates one NutchBean when the context
     * starts, stores it under {@link NutchBean#KEY}, and closes it when the
     * context is destroyed. This class should be registered in the deployment
     * descriptor as a listener.
     */
    public static class NutchBeanConstructor implements ServletContextListener {

        /** Closes the stored bean, if there is one; close failures are logged. */
        public void contextDestroyed(ServletContextEvent sce) {
            final ServletContext app = sce.getServletContext();

            LOG.info("Closing Bean");
            final Object cached = app.getAttribute(NutchBean.KEY);
            if (!(cached instanceof NutchBean)) {
                LOG.warn("No bean configured, or the wrong type?  Potential PermGen leak, or startup problem.");
                return;
            }
            try {
                ((NutchBean) cached).close();
            } catch (final IOException ex) {
                LOG.error(StringUtils.stringifyException(ex));
            }
        }

        /**
         * Creates the bean and stores it in the context. When construction
         * fails the error is logged and no attribute is set.
         */
        public void contextInitialized(ServletContextEvent sce) {
            final ServletContext app = sce.getServletContext();
            final Configuration conf = NutchConfiguration.get(app);

            LOG.info("creating new bean");
            try {
                app.setAttribute(KEY, new NutchBean(conf));
            } catch (final IOException ex) {
                LOG.error(StringUtils.stringifyException(ex));
            }
        }
    }

}