org.loklak.api.server.SearchServlet.java Source code

Introduction

Here is the source code for org.loklak.api.server.SearchServlet.java, the search servlet of the loklak message search server. Depending on the extension of the requested path it answers with Twitter-like JSON (optionally wrapped as JSONP), an OpenSearch/RSS feed, or plain text.
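
The query itself is read from the q (or query) parameter; further parameters such as source=(cache|backend|twitter|all), callback for JSONP, maximumRecords/count, minified, limit, fields, timezoneOffset, order and timeout are evaluated at the top of doGet() below. A few illustrative requests, assuming the servlet is reachable under /api/ on port 9000 as in a stock loklak deployment (the servlet mapping lives outside this file, and localhost:9000 merely matches the shortlink.urlstub default used further down):

    http://localhost:9000/api/search.json?q=loklak&source=all&maximumRecords=20
    http://localhost:9000/api/search.json?q=loklak&callback=p&minified=true
    http://localhost:9000/api/search.rss?q=loklak
    http://localhost:9000/api/search.txt?q=loklak&source=cache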

Source

/**
 *  SearchServlet
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *  
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *  
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.api.server;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.eclipse.jetty.util.log.Log;
import org.loklak.data.DAO;
import org.loklak.harvester.TwitterScraper;
import org.loklak.http.RemoteAccess;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.QueryEntry;
import org.loklak.objects.Timeline;
import org.loklak.objects.UserEntry;
import org.loklak.rss.RSSFeed;
import org.loklak.rss.RSSMessage;
import org.loklak.tools.CharacterCoding;
import org.loklak.tools.UTF8;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * The search servlet. We provide OpenSearch/RSS and Twitter-like JSON as results.
 */
public class SearchServlet extends HttpServlet {

    private static final long serialVersionUID = 563533152152063908L;

    @Override
    protected void doPost(final HttpServletRequest request, final HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }

    @Override
    protected void doGet(final HttpServletRequest request, final HttpServletResponse response)
            throws ServletException, IOException {
        final RemoteAccess.Post post = RemoteAccess.evaluate(request);
        try {

            // manage DoS
            if (post.isDoS_blackout()) {
                response.sendError(503, "your (" + post.getClientHost() + ") request frequency is too high");
                return;
            }

            // check call type
            boolean jsonExt = request.getServletPath().endsWith(".json");
            boolean rssExt = request.getServletPath().endsWith(".rss");
            boolean txtExt = request.getServletPath().endsWith(".txt");

            // evaluate get parameter
            String callback = post.get("callback", "");
            boolean jsonp = callback != null && callback.length() > 0;
            boolean minified = post.get("minified", false);
            String query = post.get("q", "");
            if (query == null || query.length() == 0)
                query = post.get("query", "");
            query = CharacterCoding.html2unicode(query).replaceAll("\\+", " ");
            final long timeout = (long) post.get("timeout", DAO.getConfig("search.timeout", 2000));
            final int count = post.isDoS_servicereduction() ? 10
                    : Math.min(post.get("count", post.get("maximumRecords", 100)),
                            post.isLocalhostAccess() ? 10000 : 1000);
            String source = post.isDoS_servicereduction() ? "cache" : post.get("source", "all"); // possible values: cache, backend, twitter, all
            int limit = post.get("limit", 100);
            String[] fields = post.get("fields", new String[0], ",");
            int timezoneOffset = post.get("timezoneOffset", 0);
            if (query.indexOf("id:") >= 0 && ("all".equals(source) || "twitter".equals(source)))
                source = "cache"; // ids cannot be retrieved from twitter with the scrape-api (yet), only from the cache
            final String ordername = post.get("order", Timeline.Order.CREATED_AT.getMessageFieldName());
            final Timeline.Order order = Timeline.parseOrder(ordername);

            // create tweet timeline
            final Timeline tl = new Timeline(order);
            Map<String, List<Map.Entry<String, Long>>> aggregations = null;
            final QueryEntry.Tokens tokens = new QueryEntry.Tokens(query);

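            // counters for the per-source hit statistics reported under search_metadata below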
            final AtomicInteger cache_hits = new AtomicInteger(0), count_backend = new AtomicInteger(0),
                    count_twitter_all = new AtomicInteger(0), count_twitter_new = new AtomicInteger(0);
            final boolean backend_push = DAO.getConfig("backend.push.enabled", false);

            if ("all".equals(source)) {
                // start all targets for search concurrently
                final int timezoneOffsetf = timezoneOffset;
                final String queryf = query;
                final long start = System.currentTimeMillis();

                // start a scraper
                Thread scraperThread = tokens.raw.length() == 0 ? null : new Thread() {
                    public void run() {
                        final String scraper_query = tokens.translate4scraper();
                        DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
                        Timeline twitterTl = DAO.scrapeTwitter(post, scraper_query, order, timezoneOffsetf, true,
                                timeout, true);
                        count_twitter_new.set(twitterTl.size());
                        tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more 
                        tl.setScraperInfo(twitterTl.getScraperInfo());
                        post.recordEvent("twitterscraper_time", System.currentTimeMillis() - start);
                    }
                };
                if (scraperThread != null)
                    scraperThread.start();

                // start a local search
                Thread localThread = queryf == null || queryf.length() == 0 ? null : new Thread() {
                    public void run() {
                        DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(queryf, order,
                                timezoneOffsetf, count, 0);
                        post.recordEvent("cache_time", System.currentTimeMillis() - start);
                        cache_hits.set(localSearchResult.timeline.getHits());
                        tl.putAll(localSearchResult.timeline);
                    }
                };
                if (localThread != null)
                    localThread.start();

                // start a backend search, but only if backend_push == true or result from scraper is too bad
                boolean start_backend_thread = false;
                if (backend_push)
                    start_backend_thread = true;
                else {
                    // wait now for termination of scraper thread and local search
                    // to evaluate how many results are available
                    // wait for the remaining timeout, but give the scraper at least 10 seconds
                    if (scraperThread != null)
                        try {
                            scraperThread.join(Math.max(10000, timeout - System.currentTimeMillis() + start));
                        } catch (InterruptedException e) {
                            // interrupted while waiting; continue with the results collected so far
                        }
                    if (localThread != null)
                        try {
                            localThread.join(Math.max(100, timeout - System.currentTimeMillis() + start));
                        } catch (InterruptedException e) {
                            // interrupted while waiting; continue with the results collected so far
                        }
                    localThread = null;
                    scraperThread = null;
                    if (tl.size() < count)
                        start_backend_thread = true;
                }
                Thread backendThread = tokens.original.length() == 0 || !start_backend_thread ? null
                        : new Thread() {
                            public void run() {
                                Timeline backendTl = DAO.searchBackend(tokens.original, order, count,
                                        timezoneOffsetf, "cache", timeout);
                                if (backendTl != null) {
                                    tl.putAll(QueryEntry.applyConstraint(backendTl, tokens, true));
                                    count_backend.set(tl.size());
                                    // TODO: read and aggregate aggregations from backend as well
                                }
                                post.recordEvent("backend_time", System.currentTimeMillis() - start);
                            }
                        };
                if (backendThread != null)
                    backendThread.start();

                // wait for termination of all threads
                if (scraperThread != null)
                    try {
                        scraperThread.join(Math.max(10000, timeout - System.currentTimeMillis() + start));
                    } catch (InterruptedException e) {
                        // interrupted while waiting; continue with the results collected so far
                    }
                if (localThread != null)
                    try {
                        localThread.join(Math.max(100, timeout - System.currentTimeMillis() + start));
                    } catch (InterruptedException e) {
                        // interrupted while waiting; continue with the results collected so far
                    }
                if (backendThread != null)
                    try {
                        backendThread.join(Math.max(100, timeout - System.currentTimeMillis() + start));
                    } catch (InterruptedException e) {
                        // interrupted while waiting; continue with the results collected so far
                    }

            } else if ("twitter".equals(source) && tokens.raw.length() > 0) {
                final long start = System.currentTimeMillis();
                final String scraper_query = tokens.translate4scraper();
                DAO.log(request.getServletPath() + " scraping with query: " + scraper_query);
                Timeline twitterTl = DAO.scrapeTwitter(post, scraper_query, order, timezoneOffset, true, timeout,
                        true);
                count_twitter_new.set(twitterTl.size());
                tl.putAll(QueryEntry.applyConstraint(twitterTl, tokens, false)); // pre-localized results are not filtered with location constraint any more 
                tl.setScraperInfo(twitterTl.getScraperInfo());
                post.recordEvent("twitterscraper_time", System.currentTimeMillis() - start);
                // in this case we use all tweets, not only the latest ones, because it may happen that there are no new tweets at all, and that is not what the user expects

            } else if ("cache".equals(source)) {
                final long start = System.currentTimeMillis();
                DAO.SearchLocalMessages localSearchResult = new DAO.SearchLocalMessages(query, order,
                        timezoneOffset, count, limit, fields);
                cache_hits.set(localSearchResult.timeline.getHits());
                tl.putAll(localSearchResult.timeline);
                aggregations = localSearchResult.aggregations;
                post.recordEvent("cache_time", System.currentTimeMillis() - start);

            } else if ("backend".equals(source) && query.length() > 0) {
                final long start = System.currentTimeMillis();
                Timeline backendTl = DAO.searchBackend(query, order, count, timezoneOffset, "cache", timeout);
                if (backendTl != null) {
                    tl.putAll(QueryEntry.applyConstraint(backendTl, tokens, true));
                    tl.setScraperInfo(backendTl.getScraperInfo());
                    // TODO: read and aggregate aggregations from backend as well
                    count_backend.set(tl.size());
                }
                post.recordEvent("backend_time", System.currentTimeMillis() - start);

            }

            final long start = System.currentTimeMillis();
            // check the latest user_ids
            DAO.announceNewUserId(tl);

            // reduce the list to the wanted number of results if we have more
            tl.reduceToMaxsize(count);

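            // under service reduction, apply an additional per-client sleep to throttle request frequency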
            if (post.isDoS_servicereduction() && !RemoteAccess.isSleepingForClient(post.getClientHost())) {
                RemoteAccess.sleep(post.getClientHost(), 2000);
            }

            // create json or xml according to path extension
            int shortlink_iflinkexceedslength = (int) DAO.getConfig("shortlink.iflinkexceedslength", 500L);
            String shortlink_urlstub = DAO.getConfig("shortlink.urlstub", "http://localhost:9000");
            if (jsonExt) {
                post.setResponse(response, jsonp ? "application/javascript" : "application/json");
                // generate json
                Map<String, Object> m = new LinkedHashMap<String, Object>();
                Map<String, Object> metadata = new LinkedHashMap<String, Object>();
                if (!minified) {
                    m.put("readme_0",
                            "THIS JSON IS THE RESULT OF YOUR SEARCH QUERY - THERE IS NO WEB PAGE WHICH SHOWS THE RESULT!");
                    m.put("readme_1",
                            "loklak.org is the framework for a message search system, not the portal, read: http://loklak.org/about.html#notasearchportal");
                    m.put("readme_2",
                            "This is supposed to be the back-end of a search portal. For the api, see http://loklak.org/api.html");
                    m.put("readme_3",
                            "Parameters q=(query), source=(cache|backend|twitter|all), callback=p for jsonp, maximumRecords=(message count), minified=(true|false)");
                }
                metadata.put("itemsPerPage", Integer.toString(count));
                metadata.put("count", Integer.toString(tl.size()));
                metadata.put("count_twitter_all", count_twitter_all.get());
                metadata.put("count_twitter_new", count_twitter_new.get());
                metadata.put("count_backend", count_backend.get());
                metadata.put("count_cache", cache_hits.get());
                metadata.put("hits", Math.max(cache_hits.get(), tl.size()));
                if (order == Timeline.Order.CREATED_AT)
                    metadata.put("period", tl.period());
                metadata.put("query", query);
                metadata.put("client", post.getClientHost());
                metadata.put("time", System.currentTimeMillis() - post.getAccessTime());
                metadata.put("servicereduction", post.isDoS_servicereduction() ? "true" : "false");
                if (tl.getScraperInfo().length() > 0)
                    metadata.put("scraperInfo", tl.getScraperInfo());
                m.put("search_metadata", metadata);
                List<Object> statuses = new ArrayList<>();
                try {
                    for (MessageEntry t : tl) {
                        UserEntry u = tl.getUser(t);
                        if (DAO.getConfig("flag.fixunshorten", false))
                            t.setText(TwitterScraper
                                    .unshorten(t.getText(shortlink_iflinkexceedslength, shortlink_urlstub)));
                        statuses.add(t.toMap(u, true, shortlink_iflinkexceedslength, shortlink_urlstub));
                    }
                } catch (ConcurrentModificationException e) {
                    // late incoming messages from concurrent peer retrieval may cause this
                    // we silently do nothing here and return what we listed so far
                }
                m.put("statuses", statuses);

                // aggregations
                Map<String, Object> agg = new LinkedHashMap<String, Object>();
                if (aggregations != null) {
                    for (Map.Entry<String, List<Map.Entry<String, Long>>> aggregation : aggregations.entrySet()) {
                        Map<String, Object> facet = new LinkedHashMap<>();
                        for (Map.Entry<String, Long> a : aggregation.getValue()) {
                            if (a.getKey().equals(query))
                                continue; // omit obvious terms that cannot be used for faceting, like a search for "#abc" where the most frequent hashtag is "#abc" itself
                            facet.put(a.getKey(), a.getValue());
                        }
                        agg.put(aggregation.getKey(), facet);
                    }
                }
                m.put("aggregations", agg);

                // write json
                response.setCharacterEncoding("UTF-8");
                PrintWriter sos = response.getWriter();
                if (jsonp)
                    sos.print(callback + "(");
                sos.print(minified ? new ObjectMapper().writer().writeValueAsString(m)
                        : new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(m));
                if (jsonp)
                    sos.println(");");
                sos.println();
            } else if (rssExt) {
                response.setCharacterEncoding("UTF-8");
                post.setResponse(response, "application/rss+xml;charset=utf-8");
                // generate xml
                RSSMessage channel = new RSSMessage();
                channel.setPubDate(new Date());
                channel.setTitle("RSS feed for Twitter search for " + query);
                channel.setDescription("");
                channel.setLink("");
                RSSFeed feed = new RSSFeed(tl.size());
                feed.setChannel(channel);
                try {
                    for (MessageEntry t : tl) {
                        UserEntry u = tl.getUser(t);
                        RSSMessage m = new RSSMessage();
                        m.setLink(t.getStatusIdUrl().toExternalForm());
                        m.setAuthor(u.getName() + " @" + u.getScreenName());
                        m.setTitle(u.getName() + " @" + u.getScreenName());
                        m.setDescription(t.getText(shortlink_iflinkexceedslength, shortlink_urlstub));
                        m.setPubDate(t.getCreatedAt());
                        m.setGuid(t.getIdStr());
                        feed.addMessage(m);
                    }
                } catch (ConcurrentModificationException e) {
                    // late incoming messages from concurrent peer retrieval may cause this
                    // we silently do nothing here and return what we listed so far
                }
                String rss = feed.toString();
                //System.out.println("feed has " + feed.size() + " entries");

                // write xml
                response.getOutputStream().write(UTF8.getBytes(rss));
            } else if (txtExt) {
                post.setResponse(response, "text/plain");
                final StringBuilder buffer = new StringBuilder(1000);
                try {
                    for (MessageEntry t : tl) {
                        UserEntry u = tl.getUser(t);
                        buffer.append(t.getCreatedAt()).append(" ").append(u.getScreenName()).append(": ")
                                .append(t.getText(shortlink_iflinkexceedslength, shortlink_urlstub)).append('\n');
                    }
                } catch (ConcurrentModificationException e) {
                    // late incoming messages from concurrent peer retrieval may cause this
                    // we silently do nothing here and return what we listed so far
                }
                response.getOutputStream().write(UTF8.getBytes(buffer.toString()));
            }
            post.recordEvent("result_count", tl.size());
            post.recordEvent("postprocessing_time", System.currentTimeMillis() - start);
            Map<String, Object> hits = new LinkedHashMap<>();
            hits.put("count_twitter_all", count_twitter_all.get());
            hits.put("count_twitter_new", count_twitter_new.get());
            hits.put("count_backend", count_backend.get());
            hits.put("cache_hits", cache_hits.get());
            post.recordEvent("hits", hits);
            DAO.log(request.getServletPath() + "?" + request.getQueryString() + " -> " + tl.size()
                    + " records returned, " + count_twitter_new.get() + " new");
            post.finalize();
        } catch (Throwable e) {
            Log.getLog().warn(e.getMessage(), e);
            //e.printStackTrace();
        }
    }

}
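
To try the .json flavor end to end, here is a minimal client sketch. It assumes a loklak server listening on http://localhost:9000 and the servlet mapped at /api/search.json; both the port and the path are assumptions about a stock deployment (neither is defined in this file), so adjust them to your setup.

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class SearchClient {
    public static void main(String[] args) throws Exception {
        String q = URLEncoder.encode("loklak", StandardCharsets.UTF_8);
        // port 9000 and the /api/search.json mapping are assumed deployment
        // values; SearchServlet itself only inspects the path extension
        URI uri = URI.create("http://localhost:9000/api/search.json?q=" + q
                + "&source=cache&minified=true&maximumRecords=10");
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(uri).GET().build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        // the servlet answers with the JSON object assembled in doGet():
        // search_metadata, statuses, and aggregations
        System.out.println(response.statusCode());
        System.out.println(response.body());
    }
}

With source=cache the request stays inside the local index; source=all makes doGet() fan out concurrently to the Twitter scraper, the local search, and, if backend.push.enabled is set or the first two yield fewer than count results, the backend, exactly as in the branches above.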