org.apache.nutchbase.searcher.NutchBeanHbase.java — source code

Java example

Introduction

Below is the source code for org.apache.nutchbase.searcher.NutchBeanHbase.java.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutchbase.searcher;

import java.io.*;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
import java.util.concurrent.ExecutorService;

import javax.servlet.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.nutch.searcher.*;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.ImmutableRowPart;
import org.apache.nutchbase.util.hbase.TableUtil;
import org.apache.nutchbase.util.hbase.TableColumns;

/**
 * One stop shopping for search-related functionality.
 *
 * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
 */
public class NutchBeanHbase implements SearchBean, HitSummarizer, HitInlinks, Closeable {

    // FIX: the logger was obtained for NutchBean.class, so every log line from
    // this class was attributed to the wrong category; register under this class.
    public static final Log LOG = LogFactory.getLog(NutchBeanHbase.class);

    /** Servlet-context attribute key under which the singleton bean is cached. */
    public static final String KEY = "nutchBean";

    //  static {
    //    LogFormatter.setShowThreadIDs(true);
    //  }

    /** Produces plain-text summaries for hits; chosen via SummarizerFactory. */
    private final Summarizer summarizer;

    /** Delegate that performs the actual index searches (distributed or local Lucene). */
    private SearchBean searchBean;

    /** Backing HBase web table, used for row lookups (text, inlinks, content type). */
    private HTable table;

    /**
     * BooleanQuery won't permit more than 32 required/prohibited clauses.  We
     * don't want to use too many of those.
     */
    private static final int MAX_PROHIBITED_TERMS = 20;

    private final Configuration conf;

    private final FileSystem fs;

    /**
     * Returns the cached instance in the servlet context.
     *
     * <p>NOTE(review): the {@code conf} parameter is unused here — the bean is
     * created and cached by {@link NutchBeanConstructor} at context startup.
     * The parameter is kept for interface compatibility with callers.
     *
     * @param app  servlet context holding the cached bean under {@link #KEY}
     * @param conf unused; see note above
     * @return the cached bean, or {@code null} if none was registered
     * @see NutchBeanConstructor
     */
    public static NutchBeanHbase get(ServletContext app, Configuration conf) throws IOException {
        final NutchBeanHbase bean = (NutchBeanHbase) app.getAttribute(KEY);
        return bean;
    }

    /**
     * Constructs a bean using the default search directory
     * ({@code searcher.dir}, falling back to {@code "crawl"}).
     *
     * @param conf      Hadoop/Nutch configuration
     * @param tablename name of the HBase web table; must be non-null and non-empty
     * @throws IOException if the filesystem or search indexes cannot be opened
     */
    public NutchBeanHbase(Configuration conf, String tablename) throws IOException {
        this(conf, null, tablename);
    }

    /**
     * Construct in a named directory.
     *
     * <p>If {@code dir} contains a {@code search-servers.txt} or
     * {@code solr-servers.txt} file, a distributed search bean is used;
     * otherwise a local Lucene index under {@code dir} is opened.
     *
     * @param conf      Hadoop/Nutch configuration
     * @param dir       search directory, or {@code null} to use {@code searcher.dir}
     * @param tablename name of the HBase web table; must be non-null and non-empty
     * @throws IllegalArgumentException if {@code tablename} is null or empty
     * @throws IOException if the filesystem or search indexes cannot be opened
     */
    public NutchBeanHbase(Configuration conf, Path dir, String tablename) throws IOException {
        if (tablename == null || "".equals(tablename)) {
            throw new java.lang.IllegalArgumentException("tablename cannot be null or empty!");
        }
        this.conf = conf;
        this.summarizer = new SummarizerFactory(this.conf).getSummarizer();
        this.fs = FileSystem.get(this.conf);
        if (dir == null) {
            dir = new Path(this.conf.get("searcher.dir", "crawl"));
        }
        final Path luceneConfig = new Path(dir, "search-servers.txt");
        final Path solrConfig = new Path(dir, "solr-servers.txt");

        if (fs.exists(luceneConfig) || fs.exists(solrConfig)) {
            searchBean = new DistributedSearchBean(conf, luceneConfig, solrConfig);
        } else {
            final Path indexDir = new Path(dir, "index_merged");
            final Path indexesDir = new Path(dir, "index");
            searchBean = new LuceneSearchBean(conf, indexDir, indexesDir);
        }
        table = new HTable(tablename);
    }

    /** Searches without dedup-field restriction or sorting. */
    public Hits search(Query query, int numHits) throws IOException {
        return search(query, numHits, null, null, false);
    }

    /** Delegates a plain (non-dup-limited) search to the underlying search bean. */
    public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse)
            throws IOException {

        return searchBean.search(query, numHits, dedupField, sortField, reverse);
    }

    /** Always reports this bean as alive; delegates are assumed healthy once constructed. */
    @Override
    public boolean ping() throws IOException {
        return true;
    }

    /** Hits sharing one dedup value, plus a flag noting that the per-dup cap was hit. */
    @SuppressWarnings("serial")
    private class DupHits extends ArrayList<Hit> {
        private boolean maxSizeExceeded;
    }

    /**
     * Search for pages matching a query, eliminating excessive hits from the
     * same site.  Hits after the first <code>maxHitsPerDup</code> from the same
     * site are removed from results.  The remaining hits have {@link
     * Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero then all
     * hits are returned.
     *
     * @param query         query
     * @param numHits       number of requested hits
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException {
        return search(query, numHits, maxHitsPerDup, "site", null, false);
    }

    /**
     * Search for pages matching a query, eliminating excessive hits with
     * matching values for a named field.  Hits after the first
     * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
     * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
     * then all hits are returned.
     *
     * @param query         query
     * @param numHits       number of requested hits
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @param dedupField    field name to check for duplicates
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException {
        return search(query, numHits, maxHitsPerDup, dedupField, null, false);
    }

    /**
     * Search for pages matching a query, eliminating excessive hits with
     * matching values for a named field.  Hits after the first
     * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
     * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
     * then all hits are returned.
     *
     * @param query         query
     * @param numHits       number of requested hits
     * @param maxHitsPerDup the maximum hits returned with matching values, or zero
     * @param dedupField    field name to check for duplicates
     * @param sortField     Field to sort on (or null if no sorting).
     * @param reverse       True if we are to reverse sort by <code>sortField</code>.
     * @return Hits the matching hits
     * @throws IOException
     */
    public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField,
            boolean reverse) throws IOException {
        if (maxHitsPerDup <= 0) // disable dup checking
            return search(query, numHits, dedupField, sortField, reverse);

        // Over-fetch raw hits so that dup removal still leaves enough results.
        final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
        int numHitsRaw = (int) (numHits * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
            LOG.info("searching for " + numHitsRaw + " raw hits");
        }
        Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse);
        final long total = hits.getTotal();
        final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
        final List<Hit> resultList = new ArrayList<Hit>();
        final Set<Hit> seen = new HashSet<Hit>();
        final List<String> excludedValues = new ArrayList<String>();
        boolean totalIsExact = true;
        for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
            // get the next raw hit
            if (rawHitNum >= hits.getLength()) {
                // Ran out of fetched hits before filling numHits: re-search with a
                // bigger raw count, prohibiting values already known to be over the cap.
                final Query optQuery = (Query) query.clone();
                for (int i = 0; i < excludedValues.size(); i++) {
                    // BooleanQuery caps required/prohibited clauses; stay under the limit.
                    if (i == MAX_PROHIBITED_TERMS)
                        break;
                    optQuery.addProhibitedTerm(excludedValues.get(i), dedupField);
                }
                numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
                if (LOG.isInfoEnabled()) {
                    LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
                }
                hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse);
                if (LOG.isInfoEnabled()) {
                    LOG.info("found " + hits.getTotal() + " raw hits");
                }
                // Restart the scan over the fresh hit list (loop increment makes this 0).
                rawHitNum = -1;
                continue;
            }

            final Hit hit = hits.getHit(rawHitNum);
            // Skip hits already processed in a previous pass over a re-searched list.
            if (seen.contains(hit))
                continue;
            seen.add(hit);

            // get dup hits for its value
            final String value = hit.getDedupValue();
            DupHits dupHits = dupToHits.get(value);
            if (dupHits == null)
                dupToHits.put(value, dupHits = new DupHits());

            // does this hit exceed maxHitsPerDup?
            if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
                if (!dupHits.maxSizeExceeded) {

                    // mark prior hits with moreFromDupExcluded
                    for (int i = 0; i < dupHits.size(); i++) {
                        dupHits.get(i).setMoreFromDupExcluded(true);
                    }
                    dupHits.maxSizeExceeded = true;

                    excludedValues.add(value); // exclude dup
                }
                totalIsExact = false;
            } else { // no -- collect the hit
                resultList.add(hit);
                dupHits.add(hit);

                // are we done?
                // we need to find one more than asked for, so that we can tell if
                // there are more hits to be shown
                if (resultList.size() > numHits)
                    break;
            }
        }

        final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()]));
        results.setTotalIsExact(totalIsExact);
        return results;
    }

    /** Returns the anchor texts of all inlinks to the given hit's row. */
    @Override
    public String[] getAnchors(HitDetails hitDetails) throws IOException {
        return getInlinks(hitDetails).getAnchors();
    }

    /** Reads the hit's row from the web table and collects its inlinks. */
    @Override
    public Inlinks getInlinks(HitDetails hitDetails) throws IOException {
        ImmutableRowPart row = getRow(hitDetails);
        Inlinks inLinks = new Inlinks();
        for (Inlink inlink : row.getInlinks()) {
            inLinks.add(inlink);
        }
        return inLinks;
    }

    /** Shared pool for parallel per-hit summary generation; never shut down (JVM lifetime). */
    private static final ExecutorService executor = Executors.newCachedThreadPool();

    /** Computes the summary for one hit; submitted to {@link #executor} in batches. */
    private class SummaryTask implements Callable<Summary> {
        private final HitDetails details;
        private final Query query;

        public SummaryTask(HitDetails details, Query query) {
            this.details = details;
            this.query = query;
        }

        public Summary call() throws Exception {
            return getSummary(details, query);
        }
    }

    /** Summarizes a single hit from its row text stored in the web table. */
    @Override
    public Summary getSummary(HitDetails hitDetails, Query query) throws IOException {
        ImmutableRowPart row = getRow(hitDetails);
        return this.summarizer.getSummary(row.getText(), query);
    }

    /**
     * Summarizes several hits in parallel on the shared executor.
     * Results are returned in the same order as {@code details}.
     *
     * @throws IOException if any individual summary task failed with an IOException
     */
    @Override
    public Summary[] getSummary(HitDetails[] details, Query query) throws IOException {
        final List<Callable<Summary>> tasks = new ArrayList<Callable<Summary>>(details.length);
        for (int i = 0; i < details.length; i++) {
            tasks.add(new SummaryTask(details[i], query));
        }

        List<Future<Summary>> summaries;
        try {
            summaries = executor.invokeAll(tasks);
        } catch (final InterruptedException e) {
            // FIX: restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        final Summary[] results = new Summary[details.length];
        for (int i = 0; i < details.length; i++) {
            final Future<Summary> f = summaries.get(i);
            Summary summary;
            try {
                summary = f.get();
            } catch (final Exception e) {
                // Unwrap IOExceptions thrown by the task so callers see the declared type.
                if (e.getCause() instanceof IOException) {
                    throw (IOException) e.getCause();
                }
                if (e instanceof InterruptedException) {
                    // FIX: restore the interrupt flag before wrapping.
                    Thread.currentThread().interrupt();
                }
                throw new RuntimeException(e);
            }
            results[i] = summary;
        }
        return results;
    }

    /** Delegates scoring explanation to the underlying search bean. */
    public String getExplanation(Query query, Hit hit) throws IOException {
        return searchBean.getExplanation(query, hit);
    }

    /** Delegates single-hit detail lookup to the underlying search bean. */
    public HitDetails getDetails(Hit hit) throws IOException {
        return searchBean.getDetails(hit);
    }

    /** Delegates batch detail lookup to the underlying search bean. */
    public HitDetails[] getDetails(Hit[] hits) throws IOException {
        return searchBean.getDetails(hits);
    }

    /**
     * Fetches the web-table row for a hit, keyed by the reversed URL, restricted
     * to the columns needed for summaries and inlinks.
     */
    public ImmutableRowPart getRow(HitDetails details) throws IOException {
        String url = details.getValue("url");
        String rowKey = TableUtil.reverseUrl(url);
        return new ImmutableRowPart(table.getRow(Bytes.toBytes(rowKey),
                new byte[][] { TableColumns.TEXT, TableColumns.CONTENT, TableColumns.PAGERANK, TableColumns.VOTES,
                        TableColumns.INLINKS, TableColumns.CONTENT_TYPE }));
    }

    /** Releases the search bean, the filesystem, and the HBase table. */
    public void close() throws IOException {
        if (searchBean != null) {
            searchBean.close();
        }
        if (fs != null) {
            fs.close();
        }
        // FIX: the HTable opened in the constructor was never closed, leaking its
        // resources (and any buffered writes). Close it along with the other handles.
        if (table != null) {
            table.close();
        }
    }

    /**
     * For debugging.
     */
    public static void main(String[] args) throws Exception {
        final String usage = "NutchBean webtable query";

        if (args.length != 2) {
            System.err.println(usage);
            System.exit(-1);
        }

        final Configuration conf = NutchConfiguration.create();
        final String webtable = args[0];
        final NutchBeanHbase bean = new NutchBeanHbase(conf, webtable);
        final Query query = Query.parse(args[1], conf);
        final Hits hits = bean.search(query, 10);
        System.out.println("Total hits: " + hits.getTotal());
        final int length = (int) Math.min(hits.getTotal(), 10);
        final Hit[] show = hits.getHits(0, length);
        final HitDetails[] details = bean.getDetails(show);
        final Summary[] summaries = bean.getSummary(details, query);

        // FIX: iterate over the fetched arrays, not hits.getLength() — details[] and
        // summaries[] are sized by show.length, so the old bound could overrun them.
        for (int i = 0; i < show.length; i++) {
            System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
        }
    }

    /**
     * Responsible for constructing a NutchBean singleton instance and
     * caching it in the servlet context. This class should be registered in
     * the deployment descriptor as a listener
     */
    public static class NutchBeanConstructor implements ServletContextListener {

        public void contextDestroyed(ServletContextEvent sce) {
        }

        public void contextInitialized(ServletContextEvent sce) {
            final ServletContext app = sce.getServletContext();
            final Configuration conf = NutchConfiguration.get(app);

            final String tablename = (String) app.getInitParameter("org.apache.nutchbase.webtable");

            LOG.info("creating new bean");
            NutchBeanHbase bean = null;
            try {
                bean = new NutchBeanHbase(conf, tablename);
                app.setAttribute(KEY, bean);
            } catch (final IOException ex) {
                LOG.error(StringUtils.stringifyException(ex));
            } catch (final IllegalArgumentException ex) {
                // FIX: a missing/empty "org.apache.nutchbase.webtable" init-param made the
                // constructor throw IAE, which escaped the listener and aborted webapp
                // startup. Log it like the IOException path instead.
                LOG.error(StringUtils.stringifyException(ex));
            }
        }
    }

}