net.yacy.cora.federate.opensearch.OpenSearchConnector.java Source code

Java tutorial

Introduction

Here is the source code for net.yacy.cora.federate.opensearch.OpenSearchConnector.java

Source

/**
 *  OpenSearchConnector
 *  Copyright 2012 by Michael Peter Christen
 *  First released 03.11.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.cora.federate.opensearch;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.parser.xml.opensearchdescriptionReader;
import net.yacy.kelondro.blob.Tables;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.WebgraphSchema;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Handling of queries to remote OpenSearch systems. Iterates to a list of
 * configured systems until number of needed results are available. Uses a
 * temporary work table to store search template urls for the iteration during
 * search.
 */
public class OpenSearchConnector {

    private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
    private int size = 0; // remember the size of active opensearch targets

    public OpenSearchConnector(Switchboard sb, boolean createworktable) {
        super();
        if (sb == null) {
            return;
        }

        confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");

        if (createworktable) { // read from config file and create worktable
            sb.tables.clear("opensearchsys");
            try {
                Configuration cfg = new Configuration(confFile);

                // copy active opensearch systems to a work table (opensearchsys)
                Iterator<Configuration.Entry> cfgentries = cfg.entryIterator();
                while (cfgentries.hasNext()) {
                    Configuration.Entry e = cfgentries.next();
                    if (e.enabled()) {
                        String title = e.key(); // get the title
                        String urlstr = e.getValue(); // get the search template url

                        Tables.Data row = new Tables.Data();
                        row.put("title", title);
                        row.put("url", urlstr);
                        try {
                            sb.tables.insert("opensearchsys", row);
                        } catch (final SpaceExceededException ex) {
                            ConcurrentLog.logException(ex);
                        }
                    }
                }
                size = sb.tables.size("opensearchsys");
            } catch (final IOException ex) {
                ConcurrentLog.logException(ex);
            }
        }
    }

    /**
     * Sends a search request to remote systems listed in worktable until the
     * searchevent contains less than needed results. Depending on already
     * collected search results none to all configured systems are queried to
     * complete available search results.
     * if query search domain is LOCAL procedure does nothing.
     */
    static public void query(Switchboard sb, SearchEvent theSearch) {
        if (theSearch != null && sb != null) {
            if (!theSearch.query.isLocal()) {
                try {
                    Iterator<Tables.Row> ossysworktable = sb.tables.iterator("opensearchsys");
                    //int needres = theSearch.query.neededResults(); // get number of needed results
                    while (ossysworktable.hasNext() /*&& theSearch.query.getResultCount() < needres*/) {
                        Tables.Row row = ossysworktable.next();
                        String osurl = row.get("url", "");
                        String name = row.get("title", "");
                        sb.heuristicRSS(
                                parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0,
                                        theSearch.query.itemsPerPage),
                                theSearch, name);
                    }
                } catch (final IOException ex) {
                    ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys");
                }
            }
        }
    }

    /**
     * replace Opensearchdescription search template parameter with actual values
     */
    private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
        String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}='
        tmps = tmps.replace("{startIndex}", Integer.toString(start));
        tmps = tmps.replace("{startPage}", "");
        tmps = tmps.replace("{count}", Integer.toString(rows));
        tmps = tmps.replace("{language}", "");
        tmps = tmps.replace("{inputEncoding}", "UTF-8");
        tmps = tmps.replace("{outputEncoding}", "UTF-8");
        return tmps.replace("{searchTerms}", query);
    }

    /**
     * add a opensearch target system to the config file
     */
    public boolean add(String name, String url, boolean active, String comment) {
        if (confFile == null) {
            return false;
        }

        try {
            Configuration conf = new Configuration(confFile);
            if (name != null && !name.isEmpty()) {
                conf.add(name, null, active);
                Configuration.Entry e = conf.get(name);
                e.setValue(url);
                e.setEnable(active);
                e.setComment(comment);
                conf.put(name, e);
                try {
                    conf.commit();
                } catch (final IOException ex) {
                    ConcurrentLog.warn("OpenSearchConnector.add", "config file write error");
                }
                return true;
            }
        } catch (final IOException e1) {
            ConcurrentLog.logException(e1);
            return false;
        }
        return false;
    }

    /**
     * Get the number of active remote opensearch target systems
     */
    public int getSize() {
        return size;
    }

    /**
     * Discover opensearch description links from local (embedded) Solr index using
     * meta data field 'outboundlinks_tag_txt' and add found systems to the
     * config file
     *  
     * @return true if background discover job was started, false if job not started
     */
    public boolean discoverFromSolrIndex(final Switchboard sb) {
        if (sb == null) {
            return false;
        }
        // check if needed Solr fields are available (selected)
        if (!sb.index.fulltext().useWebgraph()) {
            ConcurrentLog.severe("OpenSearchConnector.Discover",
                    "Error on connecting to embedded Solr webgraph index");
            return false;
        }
        final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
        final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration()
                .contains(WebgraphSchema.target_rel_s.name())
                && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name())
                        && sb.index.fulltext().getWebgraphConfiguration()
                                .contains(WebgraphSchema.target_urlstub_s.name()))
                && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
        if (!metafieldavailable) {
            ConcurrentLog.warn("OpenSearchConnector.Discover",
                    "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
            return false;
        }
        // the solr query
        final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
        final String[] webgraphqueryfields = { WebgraphSchema.target_protocol_s.getSolrFieldName(),
                WebgraphSchema.target_urlstub_s.getSolrFieldName() };
        // alternatively target_protocol_s + "://" +target_host_s + target_path_s

        final long numfound;
        try {
            SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1,
                    webgraphqueryfields);
            numfound = docList.getNumFound();
            if (numfound == 0) {
                ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job");
                return true;
            }
            ConcurrentLog.info("OpenSearchConnector.Discover",
                    "start checking " + Long.toString(numfound) + " found index results");
        } catch (final IOException ex) {
            ConcurrentLog.logException(ex);
            return false;
        }

        final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever

        // job to iterate through Solr index to find links to opensearchdescriptions
        // started as background job as connect timeouts may cause it run a long time
        final Thread job = new Thread() {
            @Override
            public void run() {
                try {
                    boolean doloop = true;
                    int loopnr = 0;
                    Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
                    while (doloop) {
                        ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at "
                                + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
                        SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null,
                                loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents
                        loopnr++;
                        if (stoptime < System.currentTimeMillis()) {// stop after max 1h
                            doloop = false;
                            ConcurrentLog.info("OpenSearchConnector.Discover",
                                    "long running discover task aborted");
                        }
                        if (docList != null && docList.size() > 0) {
                            Iterator<SolrDocument> docidx = docList.iterator();
                            while (docidx.hasNext()) {
                                SolrDocument sdoc = docidx.next();

                                String hrefurltxt = sdoc
                                        .getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://"
                                        + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
                                try {
                                    URL url = new URL(hrefurltxt);
                                    //TODO: check Blacklist
                                    if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
                                        opensearchdescriptionReader os = new opensearchdescriptionReader(
                                                hrefurltxt);
                                        if (os.getRSSorAtomUrl() != null) {
                                            // add found system to config file
                                            add(os.getShortName(), os.getRSSorAtomUrl(), false,
                                                    os.getItem("LongName"));
                                            ConcurrentLog.info("OpenSearchConnector.Discover",
                                                    "added " + os.getShortName() + " " + hrefurltxt);
                                        } else {
                                            ConcurrentLog.info("OpenSearchConnector.Discover",
                                                    "osd.xml check failed (no RSS or Atom support) for "
                                                            + hrefurltxt);
                                        }
                                    }
                                } catch (final MalformedURLException ex) {
                                }
                            }
                        } else {
                            doloop = false;
                        }
                    }
                    ConcurrentLog.info("OpenSearchConnector.Discover",
                            "finisched Solr query (checked " + Integer.toString(dblmem.size())
                                    + " unique opensearchdescription links found in " + Long.toString(numfound)
                                    + " results)");
                } catch (final IOException ex) {
                    ConcurrentLog.logException(ex);
                }
            }
        };
        job.start();
        return true;
    }
}