org.apache.nutch.searcher.SolrBean.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.searcher.SolrBean.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.searcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
//import org.apache.lucene.index.Term;
//import org.apache.lucene.search.BooleanClause;
//import org.apache.lucene.search.BooleanQuery;
//import org.apache.lucene.search.TermQuery;
//import org.apache.lucene.util.ToStringUtils;
//import org.apache.nutch.indexer.solr.SolrMappingReader;
//import org.apache.nutch.indexer.solr.SolrWriter;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Search backend that answers Nutch queries by delegating to a Solr server
 * reached over HTTP.
 *
 * <p>{@link #search(Query)} runs an edismax query over the {@code title} and
 * {@code content} fields with highlighting enabled, and turns every returned
 * document into a {@link SolrHit} that already carries its
 * {@link SolrHitDetails}, highlighted title and summary. The detail and
 * summary accessors therefore normally answer from the data cached on the
 * hit instead of contacting Solr again.
 *
 * <p>Query filtering and mapping-based resolution of the unique key are
 * disabled in this version; the unique key field is fixed to {@code "id"}.
 */
public class SolrBean implements Searcher, HitDetailer, HitSummarizer {

    public static final Log LOG = LogFactory.getLog(SolrBean.class);

    /** Stored fields requested for every document. */
    private static final String FIELD_LIST = "id,url,title,tstamp,type,content,segment,score";

    /** Maximum number of content characters used as summary when Solr returns no snippet. */
    private static final int SUMMARY_FALLBACK_LENGTH = 100;

    private final SolrServer solr;

    /** Name of the Solr field holding the document's unique key. */
    private final String searchUID = "id";

    /**
     * Creates a bean that talks to the given Solr server.
     *
     * @param conf       configuration; currently unused
     * @param solrServer base URL of the Solr server
     * @throws IOException declared for callers; not thrown by the visible code
     */
    public SolrBean(Configuration conf, String solrServer) throws IOException {
        solr = new HttpSolrServer(solrServer);
    }

    /**
     * Returns a fixed message; explanations are not implemented for this backend.
     *
     * @param query ignored
     * @param hit   ignored
     * @return a constant explanatory string
     */
    public String getExplanation(Query query, Hit hit) throws IOException {
        return "SOLR backend does not support explanations yet.";
    }

    /**
     * Wraps a Solr client failure in an {@link IOException}, keeping it as the cause.
     *
     * @param e the Solr failure
     * @return an {@code IOException} whose cause is {@code e}
     */
    public static IOException makeIOException(SolrServerException e) {
        return new IOException(e);
    }

    /**
     * Executes the query against Solr and converts the response into hits.
     *
     * <p>When the query carries no sort field, Solr's default ordering is used
     * and {@code "score"} is written back into the query's parameters as the
     * sort field (the per-hit sort value is then read from that field).
     * Otherwise the results are sorted on the requested field, ascending when
     * the parameters are marked as reversed and descending if not.
     *
     * @param query the query; its parameters supply row count, sort and dedup fields
     * @return the matching hits together with the total number found
     * @throws IOException      if the Solr request fails
     * @throws RuntimeException if a document's sort value has an unsupported type
     *                          or is missing
     */
    public Hits search(Query query) throws IOException {
        final SolrQuery solrQuery = new SolrQuery();
        solrQuery.set("q", query.getQuery());
        solrQuery.setRows(query.getParams().getNumHits());

        if (query.getParams().getSortField() == null) {
            query.getParams().setSortField("score");
        } else {
            solrQuery.setSort(query.getParams().getSortField(),
                    query.getParams().isReverse() ? ORDER.asc : ORDER.desc);
        }
        final String sortField = query.getParams().getSortField();
        final String dedupField = query.getParams().getDedupField();

        // The sort and dedup fields must be fetched too, otherwise their
        // per-document values are always null when they are not in FIELD_LIST.
        solrQuery.set("fl", fieldListWith(sortField, dedupField, searchUID));
        solrQuery.setHighlight(true);
        solrQuery.set("hl.fl", "title,content");
        solrQuery.set("hl.simple.pre", "<span class=highlight>");
        solrQuery.set("hl.simple.post", "</span>");
        solrQuery.set("defType", "edismax");
        solrQuery.set("qf", "title^4 content");

        final QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw makeIOException(e);
        }

        final SolrDocumentList docList = response.getResults();
        final Map<String, Map<String, List<String>>> highlights = response.getHighlighting();

        final Hit[] hitArr = new Hit[docList.size()];
        for (int i = 0; i < hitArr.length; i++) {
            hitArr[i] = buildHit(docList.get(i), sortField, dedupField, highlights);
        }
        return new Hits(docList.getNumFound(), hitArr);
    }

    /**
     * Convenience overload that stores the arguments in the query's parameters
     * and then delegates to {@link #search(Query)}.
     *
     * @deprecated set the values on the query's parameters and call
     *             {@link #search(Query)} instead
     */
    @Deprecated
    public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse)
            throws IOException {
        query.getParams().setNumHits(numHits);
        query.getParams().setDedupField(dedupField);
        query.getParams().setSortField(sortField);
        query.getParams().setReverse(reverse);
        return search(query);
    }

    /**
     * Looks up the stored fields of a single hit by its unique key.
     *
     * <p>If the hit is a {@link SolrHit} it is attached to the returned details
     * so that the summary and highlighted-title accessors can be used on them.
     *
     * @param hit the hit to look up
     * @return the details of the first matching document, or {@code null} when
     *         the hit has no unique key or no document matches
     * @throws IOException if the Solr request fails
     */
    public HitDetails getDetails(Hit hit) throws IOException {
        final String key = hit.getUniqueKey();
        if (key == null) {
            return null;
        }

        final SolrQuery solrQuery = new SolrQuery(searchUID + ":\"" + escapeForPhrase(key) + "\"");
        solrQuery.set("fl", FIELD_LIST);

        final QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw makeIOException(e);
        }

        final SolrDocumentList docList = response.getResults();
        if (docList == null || docList.isEmpty()) {
            return null;
        }

        final SolrHitDetails details = buildDetails(docList.get(0));
        if (hit instanceof SolrHit) {
            details.setHit((SolrHit) hit);
        }
        return details;
    }

    /**
     * Returns the details for each hit, in the same order as {@code hits}.
     *
     * <p>Hits produced by {@link #search(Query)} already carry their details
     * and are answered without a Solr request; any other hit is looked up
     * through {@link #getDetails(Hit)}.
     *
     * @param hits the hits to resolve
     * @return an array of the same length as {@code hits}
     * @throws IOException if a fallback lookup fails
     */
    public HitDetails[] getDetails(Hit[] hits) throws IOException {
        final HitDetails[] details = new HitDetails[hits.length];
        for (int i = 0; i < hits.length; i++) {
            HitDetails cached = null;
            if (hits[i] instanceof SolrHit) {
                cached = ((SolrHit) hits[i]).getHitDetails();
            }
            details[i] = (cached != null) ? cached : getDetails(hits[i]);
        }
        return details;
    }

    /**
     * Pings the Solr server.
     *
     * @return {@code true} if the ping response reports status 0
     * @throws IOException if the ping request fails
     */
    public boolean ping() throws IOException {
        try {
            return solr.ping().getStatus() == 0;
        } catch (final SolrServerException e) {
            throw makeIOException(e);
        }
    }

    /**
     * Does nothing.
     *
     * NOTE(review): the underlying SolrServer is never released here - confirm
     * whether the SolrJ version in use requires an explicit shutdown.
     */
    public void close() throws IOException {
    }

    /**
     * Returns the summary cached on the hit attached to the given details.
     *
     * @param details details previously produced by this bean
     * @param query   ignored
     * @return the hit's summary, or an empty summary when no hit is attached
     */
    @Override
    public Summary getSummary(HitDetails details, Query query) throws IOException {
        final SolrHitDetails hd = (SolrHitDetails) details;
        if (hd.getHit() == null) {
            return new Summary();
        }
        return hd.getHit().getSummary();
    }

    /**
     * Returns the highlighted title cached on the hit attached to the given details.
     *
     * @param details details previously produced by this bean
     * @param query   ignored
     * @return the highlighted title, or {@code null} when no hit is attached
     */
    public String getHighlightedTitle(HitDetails details, Query query) {
        final SolrHitDetails hd = (SolrHitDetails) details;
        if (hd.getHit() == null) {
            return null;
        }
        return hd.getHit().getTitleHighlighted();
    }

    /**
     * Returns the summaries for all given details, in order.
     *
     * @param details details previously produced by this bean
     * @param query   ignored
     * @return an array of the same length as {@code details}
     */
    @Override
    public Summary[] getSummary(HitDetails[] details, Query query) throws IOException {
        final Summary[] summaries = new Summary[details.length];
        for (int i = 0; i < details.length; i++) {
            summaries[i] = getSummary(details[i], query);
        }
        return summaries;
    }

    /** Converts one Solr document into a hit carrying its details, title and summary. */
    private SolrHit buildHit(SolrDocument solrDoc, String sortField, String dedupField,
            Map<String, Map<String, List<String>>> highlights) {
        final String url = asString(solrDoc.getFirstValue("url"));
        final String title = asString(solrDoc.getFirstValue("title"));
        final String content = asString(solrDoc.getFirstValue("content"));
        final String dedupValue = asString(solrDoc.getFirstValue(dedupField));
        final String uniqueKey = asString(solrDoc.getFirstValue(searchUID));

        final SolrHit hit = new SolrHit(uniqueKey,
                toSortValue(solrDoc.getFirstValue(sortField), sortField), dedupValue);
        final SolrHitDetails details = buildDetails(solrDoc);
        details.setHit(hit);
        hit.setHitDetails(details);

        // Solr keys the highlighting section by unique key; the url lookup is
        // kept as a fallback for setups where the two differ.
        Map<String, List<String>> snippets = null;
        if (highlights != null) {
            snippets = highlights.get(uniqueKey);
            if (snippets == null) {
                snippets = highlights.get(url);
            }
        }

        final String titleSnippet = firstSnippet(snippets, "title");
        hit.setTitleHighlighted(titleSnippet != null ? titleSnippet : title);

        final String contentSnippet = firstSnippet(snippets, "content");
        final Summary summary = new Summary();
        summary.add(new Summary.Fragment(
                contentSnippet != null ? contentSnippet : leadingText(content)));
        hit.setSummary(summary);

        return hit;
    }

    /** Flattens every (field, value) pair of the document into parallel arrays. */
    private static SolrHitDetails buildDetails(SolrDocument solrDoc) {
        final List<String> fieldList = new ArrayList<String>();
        final List<String> valueList = new ArrayList<String>();
        for (final String field : solrDoc.getFieldNames()) {
            for (final Object o : solrDoc.getFieldValues(field)) {
                fieldList.add(field);
                valueList.add(o.toString());
            }
        }

        final String[] fields = fieldList.toArray(new String[fieldList.size()]);
        final String[] values = valueList.toArray(new String[valueList.size()]);
        return new SolrHitDetails(fields, values);
    }

    /** Wraps a raw Solr field value in the matching Hadoop writable. */
    private static WritableComparable toSortValue(Object raw, String sortField) {
        if (raw instanceof Integer) {
            return new IntWritable(((Integer) raw).intValue());
        }
        if (raw instanceof Float) {
            return new FloatWritable(((Float) raw).floatValue());
        }
        if (raw instanceof String) {
            return new Text((String) raw);
        }
        if (raw instanceof Long) {
            return new LongWritable(((Long) raw).longValue());
        }
        if (raw instanceof Date) {
            // Epoch milliseconds keep the chronological ordering of the dates.
            return new LongWritable(((Date) raw).getTime());
        }
        throw new RuntimeException("Unknown sort value type! field=" + sortField
                + ", type=" + (raw == null ? "null" : raw.getClass().getName()));
    }

    /** Returns the base field list extended by any extra field names not already in it. */
    private static String fieldListWith(String... extras) {
        final StringBuilder fl = new StringBuilder(FIELD_LIST);
        for (final String extra : extras) {
            if (extra == null || extra.length() == 0) {
                continue;
            }
            if (!("," + fl + ",").contains("," + extra + ",")) {
                fl.append(',').append(extra);
            }
        }
        return fl.toString();
    }

    /** Escapes the two characters that are special inside a quoted phrase: backslash and quote. */
    private static String escapeForPhrase(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"");
    }

    /** Returns the first highlight snippet for the field, or null when there is none. */
    private static String firstSnippet(Map<String, List<String>> snippets, String field) {
        if (snippets == null) {
            return null;
        }
        final List<String> values = snippets.get(field);
        return (values == null || values.isEmpty()) ? null : values.get(0);
    }

    /** Returns at most the first SUMMARY_FALLBACK_LENGTH characters; null becomes empty. */
    private static String leadingText(String content) {
        if (content == null) {
            return "";
        }
        return content.substring(0, Math.min(content.length(), SUMMARY_FALLBACK_LENGTH));
    }

    /** Null-safe string conversion of a raw Solr field value. */
    private static String asString(Object value) {
        return value == null ? null : value.toString();
    }

}