org.loklak.harvester.TwitterScraper.java Source code

Java tutorial

Introduction

Here is the source code for org.loklak.harvester.TwitterScraper.java

Source

/**
 *  TwitterScraper
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.harvester;

import static org.apache.http.util.EntityUtils.consumeQuietly;
import static org.loklak.http.ClientConnection.getCustomClosableHttpClient;
import static org.loklak.http.ClientConnection.getHTML;

import org.loklak.objects.AbstractObjectEntry;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.json.JSONObject;
import org.loklak.data.Classifier;
import org.loklak.data.DAO;
import org.loklak.data.Classifier.Category;
import org.loklak.data.Classifier.Context;
import org.loklak.geo.GeoMark;
import org.loklak.geo.LocationSource;
import org.loklak.objects.QueryEntry.PlaceContext;
import org.loklak.tools.bayes.Classification;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.json.JSONArray;
import org.json.JSONException;
import org.loklak.data.IncomingMessageBuffer;
import org.loklak.http.ClientConnection;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.ProviderType;
import org.loklak.objects.SourceType;
import org.loklak.objects.Timeline;
import org.loklak.objects.UserEntry;

public class TwitterScraper {

    /** Fixed pool of 40 threads; the scraper submits tweets to it whose post-processing is expected to be time consuming. */
    public static final ExecutorService executor = Executors.newFixedThreadPool(40);
    /**
     * Matches a span with class "Emoji Emoji--forLinks" followed by a "visuallyhidden" span marked
     * aria-hidden="true"; group 1 is the text content of the second span.
     */
    public static final Pattern emoji_pattern_span = Pattern.compile(
            "<span [^>]*class=\"Emoji Emoji--forLinks\" [^>]*>[\\n]*[^<]*</span>[\\n]*<span [^>]*class=\"visuallyhidden\" [^>]*aria-hidden=\"true\"[^>]*>[\\n]*([^<]*)[\\n]*</span>");
    /** Matches a showFailureMessage('...') call whose argument contains "main"; group 1 is used as the URL of the script holding the bearer token. */
    private static final Pattern bearerJsUrlRegex = Pattern.compile("showFailureMessage\\(\\'(.*?main.*?)\\'\\);");
    /** Matches the script statement that sets the "gt" cookie; group 1 is the numeric guest token. */
    private static final Pattern guestTokenRegex = Pattern
            .compile("document\\.cookie \\= decodeURIComponent\\(\\\"gt\\=([0-9]+);");
    /** Matches BEARER_TOKEN:"..." inside a script; group 1 is the token value. */
    private static final Pattern bearerTokenRegex = Pattern.compile("BEARER_TOKEN:\\\"(.*?)\\\"");

    /**
     * Searches Twitter for the given query and waits a limited time for tweets that are still
     * being post-processed.
     *
     * @param query the search query
     * @param filterList content filters, passed through to the scraper
     * @param order ordering of the resulting timeline
     * @param writeToIndex passed through to every scraped tweet
     * @param writeToBackend passed through to every scraped tweet
     * @param jointime overall time budget in milliseconds for waiting on unfinished tweets; every
     *        unfinished tweet is still granted at least 10 ms
     * @return the timeline of finished tweets; never null, possibly empty
     */
    public static Timeline search(final String query, final ArrayList<String> filterList,
            final Timeline.Order order, final boolean writeToIndex, final boolean writeToBackend, int jointime) {
        Timeline[] tl = search(query, filterList, order, writeToIndex, writeToBackend);
        if (tl == null || tl.length < 2 || tl[0] == null) {
            // the scraper could not deliver anything: answer with an empty timeline instead of
            // failing with a NullPointerException
            Timeline empty = new Timeline(order);
            empty.setScraperInfo("local");
            return empty;
        }
        if (tl[1] == null) {
            // nothing is in post-processing, so there is nothing to wait for
            return tl[0];
        }
        long timeout = System.currentTimeMillis() + jointime;
        for (TwitterTweet tt : tl[1]) {
            long remainingWait = Math.max(10, timeout - System.currentTimeMillis());
            if (tt.waitReady(remainingWait)) {
                // double additions are detected
                tl[0].add(tt, tt.getUser());
            }
        }
        return tl[0];
    }

    /**
     * Unfiltered variant of the joining search: delegates with an empty filter list.
     *
     * @param query the search query
     * @param order ordering of the resulting timeline
     * @param writeToIndex passed through to the filtered search
     * @param writeToBackend passed through to the filtered search
     * @param jointime passed through to the filtered search
     * @return whatever the filtered search returns for an empty filter list
     */
    public static Timeline search(final String query, final Timeline.Order order, final boolean writeToIndex,
            final boolean writeToBackend, int jointime) {
        final ArrayList<String> noFilter = new ArrayList<>();
        return search(query, noFilter, order, writeToIndex, writeToBackend, jointime);
    }

    /**
     * Builds the Twitter search URL for a query, e.g.
     * https://twitter.com/search?f=tweets&vertical=default&q=kaffee&src=typd
     * <p>
     * '+' characters in the query are treated as spaces. For terms starting with "since:" or
     * "until:" everything from the first '_' on is cut off. The result type is "videos" only if
     * "video" is the single entry of the filter list, otherwise "tweets".
     * See https://twitter.com/search-advanced and
     * https://support.twitter.com/articles/71577-how-to-use-advanced-twitter-search# for the query syntax.
     *
     * @param query the search query
     * @param filterList content filters; null is treated like an empty list
     * @return the search URL
     */
    private static String prepareSearchUrl(final String query, final ArrayList<String> filterList) {
        // query q
        StringBuilder t = new StringBuilder(query.length());
        for (String s : query.replace('+', ' ').split(" ")) {
            t.append(' ');
            if (s.startsWith("since:") || s.startsWith("until:")) {
                int u = s.indexOf('_');
                t.append(u < 0 ? s : s.substring(0, u));
            } else {
                t.append(s);
            }
        }
        final String q;
        try {
            // substring(1) drops the separator that was put in front of the first term
            q = t.length() == 0 ? "*" : URLEncoder.encode(t.substring(1), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // every Java platform has to support UTF-8, so this is a broken runtime; do not
            // continue with an empty url as if nothing happened
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }

        // type of content to fetch
        boolean videoOnly = filterList != null && filterList.size() == 1 && filterList.contains("video");
        String type = videoOnly ? "videos" : "tweets";

        // building url
        return "https://twitter.com/search?f=" + type + "&vertical=default&q=" + q + "&src=typd";
    }

    /**
     * Unfiltered variant of the URL based search: delegates with an empty filter list.
     *
     * @param query the search query
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to the filtered search
     * @param writeToBackend passed through to the filtered search
     * @return whatever the filtered search returns for an empty filter list
     */
    @SuppressWarnings("unused")
    private static Timeline[] search(final String query, final Timeline.Order order, final boolean writeToIndex,
            final boolean writeToBackend) {
        final ArrayList<String> noFilter = new ArrayList<>();
        return search(query, noFilter, order, writeToIndex, writeToBackend);
    }

    /**
     * Loads the Twitter search result page for the query and scrapes the tweets out of it.
     * See https://twitter.com/search-advanced and
     * https://support.twitter.com/articles/71577-how-to-use-advanced-twitter-search# for the query syntax.
     *
     * @param query the search query
     * @param filterList content filters, used for building the url and for filtering the tweets
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to every scraped tweet
     * @param writeToBackend passed through to every scraped tweet
     * @return two timelines in one array: [0] holds finished tweets, [1] holds tweets which are
     *         still in postprocessing. Never null; when the page cannot be loaded or parsed the
     *         array holds two empty timelines.
     */
    private static Timeline[] search(final String query, final ArrayList<String> filterList,
            final Timeline.Order order, final boolean writeToIndex, final boolean writeToBackend) {
        String https_url = prepareSearchUrl(query, filterList);
        Timeline[] timelines = null;
        try {
            ClientConnection connection = new ClientConnection(https_url);
            // without an input stream there is nothing to parse; fall through to the empty result
            // below instead of returning null, which callers dereference without a check.
            // As before, close() is only called on a connection that delivered a stream.
            if (connection.inputStream != null) {
                try {
                    BufferedReader br = new BufferedReader(
                            new InputStreamReader(connection.inputStream, StandardCharsets.UTF_8));

                    timelines = search(br, filterList, order, writeToIndex, writeToBackend);
                } catch (IOException e) {
                    DAO.severe(e);
                } finally {
                    connection.close();
                }
            }
        } catch (IOException ignored) {
            // this could mean that twitter rejected the connection (DoS protection?) or we are offline (we should be silent then)
        }

        if (timelines == null) {
            // connection or parsing failed: answer with an empty result
            timelines = new Timeline[] { new Timeline(order), new Timeline(order) };
        }
        if (timelines[0] != null)
            timelines[0].setScraperInfo("local");
        if (timelines[1] != null)
            timelines[1].setScraperInfo("local");
        return timelines;
    }

    /**
     * Unfiltered variant of the file parser: delegates with an empty filter list.
     *
     * @param file the file to parse
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to the filtered parser
     * @param writeToBackend passed through to the filtered parser
     * @return whatever the filtered parser returns for an empty filter list
     */
    private static Timeline[] parse(final File file, final Timeline.Order order, final boolean writeToIndex,
            final boolean writeToBackend) {
        final ArrayList<String> noFilter = new ArrayList<>();
        return parse(file, noFilter, order, writeToIndex, writeToBackend);
    }

    /**
     * Scrapes tweets out of a file, read as UTF-8, which holds the HTML of a search result page.
     *
     * @param file the file to parse
     * @param filterList content filters applied to the scraped tweets
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to every scraped tweet
     * @param writeToBackend passed through to every scraped tweet
     * @return two timelines in one array: [0] holds finished tweets, [1] holds tweets which are
     *         still in postprocessing. Never null; when the file cannot be read the array holds
     *         two empty timelines.
     */
    private static Timeline[] parse(final File file, final ArrayList<String> filterList, final Timeline.Order order,
            final boolean writeToIndex, final boolean writeToBackend) {
        Timeline[] timelines = null;
        // try-with-resources: the file handle is released even if scraping fails half way
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            timelines = search(br, filterList, order, writeToIndex, writeToBackend);
        } catch (IOException e) {
            DAO.severe(e);
        }
        if (timelines == null) {
            timelines = new Timeline[] { new Timeline(order), new Timeline(order) };
        }

        if (timelines[0] != null)
            timelines[0].setScraperInfo("local");
        if (timelines[1] != null)
            timelines[1].setScraperInfo("local");
        return timelines;
    }

    /**
     * Unfiltered variant of the stream scraper: delegates with an empty filter list.
     *
     * @param br reader over the HTML of a search result page
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to the filtered scraper
     * @param writeToBackend passed through to the filtered scraper
     * @return whatever the filtered scraper returns for an empty filter list
     * @throws IOException if the filtered scraper throws it
     */
    private static Timeline[] search(final BufferedReader br, final Timeline.Order order,
            final boolean writeToIndex, final boolean writeToBackend) throws IOException {
        final ArrayList<String> noFilter = new ArrayList<>();
        return search(br, noFilter, order, writeToIndex, writeToBackend);
    }

    /**
     * Scrapes messages from the reader stream, line by line. Attribute values are collected per
     * tweet into a property map; once all ten properties are present (or, with debug logging on,
     * more than four and the line contains "stream-item") the tweet is built from them.
     * Tweets which are found in the message exist-cache are not returned, so the result holds
     * only messages considered new.
     * The reader is closed when the stream was read completely.
     *
     * @param br reader over the HTML of a search result page
     * @param filterList content filters; tweets rejected by filterPosts are dropped
     * @param order ordering of the resulting timelines
     * @param writeToIndex passed through to every scraped tweet
     * @param writeToBackend passed through to every scraped tweet
     * @return two timelines in one array: Timeline[0] is the one which is finished to be used, Timeline[1] contains messages which are in postprocessing
     * @throws IOException if reading fails or a tweet url is malformed
     */
    private static Timeline[] search(final BufferedReader br, final ArrayList<String> filterList,
            final Timeline.Order order, final boolean writeToIndex, final boolean writeToBackend)
            throws IOException {
        Timeline timelineReady = new Timeline(order);
        Timeline timelineWorking = new Timeline(order);
        String input;
        Map<String, prop> props = new HashMap<String, prop>();
        Set<String> images = null;
        Set<String> videos = null;
        String place_id = "";
        String place_name = "";
        boolean parsing_favourite = false, parsing_retweet = false;
        int line = 0; // first line is 1, according to emacs which numbers the first line also as 1
        boolean debuglog = DAO.getConfig("flag.debug.twitter_scraper", "false").equals("true");

        while ((input = br.readLine()) != null) {
            line++;
            input = input.trim();

            if (input.length() == 0)
                continue;

            // debug
            if (debuglog)
                DAO.log(line + ": " + input);

            // parse
            int p;
            if ((p = input.indexOf("=\"account-group")) > 0) {
                props.put("userid", new prop(input, p, "data-user-id"));
                continue;
            }
            if ((p = input.indexOf("class=\"avatar js-action-profile-avatar")) > 0) {
                props.put("useravatarurl", new prop(input, p, "src"));
                continue;
            }
            if ((p = input.indexOf("data-name=")) >= 0) {
                props.put("userfullname", new prop(input, p, "data-name"));
                // don't continue here, username is in the same line
            }
            if ((p = input.indexOf("data-screen-name=")) >= 0) {
                props.put("usernickname", new prop(input, p, "data-screen-name"));
                // don't continue here, fullname is in the same line
            }
            if ((p = input.indexOf("class=\"tweet-timestamp")) > 0) {
                props.put("tweetstatusurl", new prop(input, 0, "href"));
                props.put("tweettimename", new prop(input, p, "title"));
                // don't continue here because "class=\"_timestamp" is in the same line
            }
            if ((p = input.indexOf("class=\"_timestamp")) > 0) {
                props.put("tweettimems", new prop(input, p, "data-time-ms"));
                continue;
            }
            if ((p = input.indexOf("class=\"ProfileTweet-action--retweet")) > 0) {
                parsing_retweet = true;
                continue;
            }
            if ((p = input.indexOf("class=\"ProfileTweet-action--favorite")) > 0) {
                parsing_favourite = true;
                continue;
            }
            if ((p = input.indexOf("class=\"TweetTextSize")) > 0) {
                // read until closing p tag to account for new lines in tweets
                while (input.lastIndexOf("</p>") == -1) {
                    String next = br.readLine();
                    if (next == null) {
                        // the stream ended inside the tweet text; without this check the loop
                        // would append "null" forever
                        break;
                    }
                    input = input + ' ' + next;
                }
                prop tweettext = new prop(input, p, null);
                props.put("tweettext", tweettext);
                continue;
            }
            if ((p = input.indexOf("class=\"ProfileTweet-actionCount")) > 0) {
                // the count belongs to whichever action marker was seen last
                if (parsing_retweet) {
                    prop tweetretweetcount = new prop(input, p, "data-tweet-stat-count");
                    props.put("tweetretweetcount", tweetretweetcount);
                    parsing_retweet = false;
                }
                if (parsing_favourite) {
                    props.put("tweetfavouritecount", new prop(input, p, "data-tweet-stat-count"));
                    parsing_favourite = false;
                }
                continue;
            }
            // get images
            // create the image set only when there is none yet: the former test on 'videos'
            // replaced a set which already held an image whenever 'videos' was still null
            if (images == null)
                images = new HashSet<>();
            if ((p = input.indexOf("<img")) >= 0) {
                String img_link = new prop(input, p, "src").value;
                if (img_link != null && img_link.contains("pbs.twimg.com/media/")) {
                    images.add(img_link);
                    continue;
                }
            }
            // we have two opportunities to get video thumbnails == more images; images in the presence of video content should be treated as thumbnail for the video
            if (videos == null)
                videos = new HashSet<>();
            if ((p = input.indexOf("class=\"animated-gif-thumbnail\"")) > 0) {
                String image_url = new prop(input, 0, "src").value;
                images.add(image_url);
                continue;
            }
            if ((p = input.indexOf("class=\"animated-gif\"")) > 0) {
                String image_url = new prop(input, p, "poster").value;
                images.add(image_url);
                continue;
            }
            if ((p = input.indexOf("<source video-src")) >= 0 && input.indexOf("type=\"video/") > p) {
                String video_url = new prop(input, p, "video-src").value;
                videos.add(video_url);
                continue;
            }
            if (input.indexOf("AdaptiveMedia-videoContainer") > 0) {
                /* String tweetUrl = props.get("tweetstatusurl").value;
                 * String[] videoUrls = fetchTwitterVideos(tweetUrl);
                 * Collections.addAll(videos, videoUrls);
                 *
                 * Not a good idea to fetch video right now. Need to add another endpoint which
                 * lets end users fetch complete videos from here.
                 * See https://github.com/loklak/loklak_server/issues/1298
                 **/
            }
            if ((p = input.indexOf("class=\"Tweet-geo")) > 0) {
                prop place_name_prop = new prop(input, p, "title");
                place_name = place_name_prop.value;
                continue;
            }
            if ((p = input.indexOf("class=\"ProfileTweet-actionButton u-linkClean js-nav js-geo-pivot-link")) > 0) {
                prop place_id_prop = new prop(input, p, "data-place-id");
                place_id = place_id_prop.value;
                continue;
            }

            if (props.size() == 10 || (debuglog && props.size() > 4 && input.indexOf("stream-item") > 0)) {

                if (!filterPosts(filterList, props, videos, images)) {
                    // drop everything collected for the rejected tweet, including its media, so
                    // that nothing of it is attached to the following tweet
                    props = new HashMap<String, prop>();
                    place_id = "";
                    place_name = "";
                    videos = null;
                    images = null;
                    continue;
                }

                //TODO: Add more filters

                // the tweet is complete, evaluate the result
                if (debuglog)
                    DAO.log("*** line " + line + " propss.size() = " + props.size());
                prop userid = props.get("userid");
                if (userid == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value userid");
                    continue;
                }
                prop usernickname = props.get("usernickname");
                if (usernickname == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value usernickname");
                    continue;
                }
                prop useravatarurl = props.get("useravatarurl");
                if (useravatarurl == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value useravatarurl");
                    continue;
                }
                prop userfullname = props.get("userfullname");
                if (userfullname == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value userfullname");
                    continue;
                }
                UserEntry user = new UserEntry(userid.value, usernickname.value, useravatarurl.value,
                        MessageEntry.html2utf8(userfullname.value));

                prop tweettimems = props.get("tweettimems");
                if (tweettimems == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweettimems");
                    continue;
                }
                prop tweetretweetcount = props.get("tweetretweetcount");
                if (tweetretweetcount == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweetretweetcount");
                    continue;
                }
                prop tweetfavouritecount = props.get("tweetfavouritecount");
                if (tweetfavouritecount == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweetfavouritecount");
                    continue;
                }
                // the remaining three properties can only be missing in debug mode, where an
                // incomplete property set is evaluated as well
                prop tweettimename = props.get("tweettimename");
                if (tweettimename == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweettimename");
                    continue;
                }
                prop tweetstatusurl = props.get("tweetstatusurl");
                if (tweetstatusurl == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweetstatusurl");
                    continue;
                }
                prop tweettext = props.get("tweettext");
                if (tweettext == null) {
                    if (debuglog)
                        DAO.log("*** line " + line + " MISSING value tweettext");
                    continue;
                }

                TwitterTweet tweet = new TwitterTweet(user.getScreenName(), Long.parseLong(tweettimems.value),
                        tweettimename.value, tweetstatusurl.value,
                        tweettext.value, Long.parseLong(tweetretweetcount.value),
                        Long.parseLong(tweetfavouritecount.value), images, videos, place_name, place_id, user,
                        writeToIndex, writeToBackend);
                if (DAO.messages == null || !DAO.messages.existsCache(tweet.getPostId())) {
                    // checking against the exist cache is incomplete. A false negative would just cause that a tweet is
                    // indexed again.
                    if (tweet.willBeTimeConsuming()) {
                        executor.execute(tweet);
                        // because the executor may run the thread in the current thread it could be possible that the result is here already
                        if (tweet.isReady()) {
                            timelineReady.add(tweet, user);
                        } else {
                            timelineWorking.add(tweet, user);
                        }
                    } else {
                        // no additional thread needed, run the postprocessing in the current thread
                        tweet.run();
                        timelineReady.add(tweet, user);
                    }
                }
                // start the next tweet with a clean state; the place must be reset as well,
                // otherwise a tweet without geo information inherits the place of its predecessor
                videos = null;
                images = null;
                place_id = "";
                place_name = "";
                props.clear();
                continue;
            }
        }
        br.close();

        return new Timeline[] { timelineReady, timelineWorking };
    }

    /**
     * Fetches the video urls which belong to a tweet. Three requests are made: the mobile page of
     * the tweet (for the url of the script holding the bearer token and for the guest token), that
     * script (for the bearer token) and the conversation API (for the video variants).
     *
     * @param tweetUrl the path of the tweet; it is appended to https://mobile.twitter.com and the
     *        part behind its last '/' is used as tweet id
     * @return the video urls; an empty array if tweetUrl is null or any step fails
     */
    public static String[] fetchTwitterVideos(String tweetUrl) {
        if (tweetUrl == null) {
            return new String[] {};
        }
        // Extract BEARER_TOKEN holding js and Guest token
        String mobileUrl = "https://mobile.twitter.com" + tweetUrl;
        String bearerJsUrl = null;
        String guestToken = null;
        String bearerToken = null;
        try {
            ClientConnection conn = new ClientConnection(mobileUrl);
            if (conn.inputStream == null) {
                // nothing to read; as elsewhere in this class close() is only called on a
                // connection that delivered a stream
                return new String[] {};
            }
            try {
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(conn.inputStream, StandardCharsets.UTF_8));
                String line;
                while ((line = br.readLine()) != null) {
                    if (bearerJsUrl != null && guestToken != null) {
                        // Both the entities are found
                        break;
                    }
                    if (line.length() == 0) {
                        continue;
                    }
                    Matcher m = bearerJsUrlRegex.matcher(line);
                    if (m.find()) {
                        bearerJsUrl = m.group(1);
                        continue;
                    }
                    m = guestTokenRegex.matcher(line);
                    if (m.find()) {
                        guestToken = m.group(1);
                    }
                }
            } finally {
                // release the connection, also when reading fails
                conn.close();
            }
        } catch (IOException e) {
            DAO.severe("Unable to open mobile URL: " + mobileUrl, e);
            return new String[] {};
        }

        if (bearerJsUrl == null) {
            // the page did not name a token script, so there is no url to load the token from
            return new String[] {};
        }

        // Get BEARER_TOKEN from bearer token holder JS
        try {
            bearerToken = getBearerTokenFromJs(bearerJsUrl);
        } catch (IOException e) {
            DAO.severe("Error while fetching BEARER_TOKEN", e);
            return new String[] {};
        }

        try {
            int slashIndex = tweetUrl.lastIndexOf('/');
            String tweetId = tweetUrl.substring(slashIndex + 1);
            return getConversationVideos(tweetId, bearerToken, guestToken);
        } catch (IOException e) {
            DAO.severe("Error while getting data JSON for Tweet " + tweetUrl, e);
        }
        return new String[] {};
    }

    /**
     * Requests the conversation JSON of a tweet and collects the urls of all variants of the
     * video attached as first media entry.
     *
     * @param tweetId id of the tweet
     * @param bearerToken value for the "Authorization: Bearer" header
     * @param guestToken value for the "x-guest-token" header
     * @return the video urls; an empty array if the response has no body or does not have the
     *         expected JSON structure
     * @throws IOException if the request fails
     */
    private static String[] getConversationVideos(String tweetId, String bearerToken, String guestToken)
            throws IOException {
        String conversationApiUrl = "https://api.twitter.com/2/timeline/conversation/" + tweetId + ".json";
        // NOTE(review): the client is never closed here; if getCustomClosableHttpClient creates a
        // new client per call it should be closed after use - confirm ownership
        CloseableHttpClient httpClient = getCustomClosableHttpClient(true);
        HttpGet req = new HttpGet(conversationApiUrl);
        req.setHeader("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36");
        req.setHeader("Authorization", "Bearer " + bearerToken);
        req.setHeader("x-guest-token", guestToken);
        HttpEntity entity = httpClient.execute(req).getEntity();
        if (entity == null) {
            // a response without body carries no videos
            return new String[] {};
        }
        String html;
        try {
            html = getHTML(entity);
        } finally {
            // release the entity also when reading it fails
            consumeQuietly(entity);
        }
        if (html == null) {
            return new String[] {};
        }
        try {
            JSONArray media = (new JSONObject(html)).getJSONObject("globalObjects").getJSONObject("tweets")
                    .getJSONObject(tweetId).getJSONObject("extended_entities").getJSONArray("media");
            // getJSONObject reports a wrong element type as JSONException, which is handled
            // below; a plain cast would escape as ClassCastException
            JSONArray variants = media.getJSONObject(0).getJSONObject("video_info").getJSONArray("variants");
            ArrayList<String> urls = new ArrayList<>(variants.length());
            for (int i = 0; i < variants.length(); i++) {
                urls.add(variants.getJSONObject(i).getString("url"));
            }
            return urls.toArray(new String[0]);
        } catch (JSONException e) {
            // This is not an issue. Sometimes, there are videos in long conversations but other ones get media class
            //  div, so this fetching process is triggered.
            DAO.severe("Error while parsing videos from conversation JSON for " + tweetId, e);
        }
        return new String[] {};
    }

    /**
     * Loads a script and extracts the bearer token from it. Only the first line of the script is
     * inspected.
     *
     * @param jsUrl url of the script which holds the BEARER_TOKEN
     * @return the bearer token
     * @throws IOException if no url is given, the script cannot be loaded or its first line does
     *         not contain a token
     */
    private static String getBearerTokenFromJs(String jsUrl) throws IOException {
        if (jsUrl == null) {
            throw new IOException("Couldn't get BEARER_TOKEN: no script url given");
        }
        ClientConnection conn = new ClientConnection(jsUrl);
        if (conn.inputStream == null) {
            // as elsewhere in this class close() is only called on a connection that delivered a stream
            throw new IOException("Couldn't get BEARER_TOKEN: no content at " + jsUrl);
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(conn.inputStream, StandardCharsets.UTF_8));
            String line = br.readLine();
            // an empty response yields null here
            if (line != null) {
                Matcher m = bearerTokenRegex.matcher(line);
                if (m.find()) {
                    return m.group(1);
                }
            }
        } finally {
            // release the connection on every path
            conn.close();
        }
        throw new IOException("Couldn't get BEARER_TOKEN");
    }

    /** Matches a shortened YouTube link, e.g. youtu.be/dQw4w9WgXcQ */
    private static final Pattern youtubeShortUrlRegex = Pattern.compile("youtu\\.be/[0-9A-Za-z_-]+");
    /** Matches a YouTube watch link, e.g. youtube.com/watch?v=dQw4w9WgXcQ; the '?' must be escaped to be matched literally. */
    private static final Pattern youtubeWatchUrlRegex = Pattern.compile("youtube\\.com/watch\\?v=[0-9A-Za-z_-]+");

    /**
     * Filter Posts(here tweets) according to values.
     *   image: filter tweets with images, neglect 'tweets without images'
     *   video: filter tweets also having video and other values like image. For only value as video,
     *          tweets with videos are filtered in prepareUrl() method
     * A tweet counts as having a video if a video url was collected for it or if its text holds a
     * YouTube link.
     *
     * @param filterList the requested filters; null or empty accepts every tweet
     * @param props the properties collected for the tweet; only "tweettext" is read
     * @param videos video urls collected for the tweet; null is treated as empty
     * @param images image urls collected for the tweet; null is treated as empty
     * @return true if the tweet passes all requested filters
     */
    private static boolean filterPosts(ArrayList<String> filterList, Map<String, prop> props, Set<String> videos,
            Set<String> images) {
        if (filterList == null || filterList.isEmpty()) {
            return true;
        }

        // Filter tweets with videos and others
        if (filterList.contains("video") && filterList.size() > 1) {
            prop tweettext = props == null ? null : props.get("tweettext");
            String text = tweettext == null || tweettext.value == null ? "" : tweettext.value;
            boolean hasVideo = (videos != null && !videos.isEmpty())
                    || youtubeShortUrlRegex.matcher(text).find()
                    || youtubeWatchUrlRegex.matcher(text).find();
            if (!hasVideo) {
                return false;
            }
        }

        // Filter tweets with images
        if (filterList.contains("image") && (images == null || images.isEmpty())) {
            return false;
        }

        //TODO: Add more filters

        return true;
    }

    /**
     * A key/value pair extracted from one line of HTML. The value stays null if the line does not
     * hold what was looked for.
     */
    private static class prop {
        public String key, value = null;

        /**
         * Creates a property without key from a given value.
         *
         * @param value the value
         */
        public prop(String value) {
            this.key = null;
            this.value = value;
        }

        /**
         * Extracts a value from a line of HTML.
         * With a key, the value of the first attribute key="..." found at or behind start is taken.
         * Without a key (null), the content of the element whose opening tag ends at the first '>'
         * at or behind start is taken, up to the closing tag which balances it.
         *
         * @param line the line of HTML
         * @param start the position in line where the search starts
         * @param key the attribute name, or null to extract element content
         */
        public prop(String line, int start, String key) {
            this.key = key;
            if (key == null) {
                int p = line.indexOf('>', start);
                if (p > 0) {
                    int c = 1; // number of elements which are currently open
                    int q = p + 1;
                    while (c > 0 && q < line.length()) {
                        // a '<' as very last character has no successor to inspect; looking at
                        // it anyway would throw a StringIndexOutOfBoundsException
                        if (line.charAt(q) == '<' && q + 1 < line.length()) {
                            char next = line.charAt(q + 1);
                            // tags starting with 'i' are not counted - presumably because of
                            // <img>, which has no closing tag
                            if (next != 'i') {
                                if (next == '/')
                                    c--;
                                else
                                    c++;
                            }
                        }
                        q++;
                    }
                    try {
                        // q is one behind the '<' of the balancing closing tag
                        value = line.substring(p + 1, q - 1);
                    } catch (StringIndexOutOfBoundsException e) {
                        // happens if the '>' is the last character of the line
                        DAO.debug(e);
                    }
                }
            } else {
                int p = line.indexOf(key + "=\"", start);
                if (p >= 0) {
                    int valueStart = p + key.length() + 2; // skip key, '=' and the opening quote
                    int q = line.indexOf('"', valueStart);
                    if (q > 0) {
                        value = line.substring(valueStart, q);
                    }
                }
            }
        }

        /**
         * @return true if a value was extracted
         */
        @SuppressWarnings("unused")
        public boolean success() {
            return value != null;
        }

        @Override
        public String toString() {
            return this.key + "=" + (this.value == null ? "unknown" : this.value);
        }
    }

    // matches a hashtag link (href starting with /hashtag/, class starting with twitter-hashtag);
    // group 1 is the content of the <b> element, i.e. the tag without '#'
    final static Pattern hashtag_pattern = Pattern
            .compile("<a href=\"/hashtag/.*?\".*?class=\"twitter-hashtag.*?\".*?><s>#</s><b>(.*?)</b></a>");
    // matches a link marked as twitter-timeline-link; group 1 is the href without the leading
    // "https://", group 2 the data-expanded-url attribute, group 3 the title attribute
    final static Pattern timeline_link_pattern = Pattern.compile(
            "<a href=\"https://(.*?)\".*? data-expanded-url=\"(.*?)\".*?twitter-timeline-link.*?title=\"(.*?)\".*?>.*?</a>");
    // matches a t.co link whose text is a pic.twitter.com address; group 1 is the t.co url,
    // group 2 the part behind "pic.twitter.com/"
    final static Pattern timeline_embed_pattern = Pattern
            .compile("<a href=\"(https://t.co/\\w+)\" class=\"twitter-timeline-link.*?>pic.twitter.com/(.*?)</a>");
    // matches an emoji image inside text (class "Emoji Emoji--forText"); group 1 is its alt attribute
    final static Pattern emoji_pattern = Pattern
            .compile("<img .*?class=\"Emoji Emoji--forText\".*?alt=\"(.*?)\".*?>");
    // matches two consecutive space characters
    final static Pattern doublespace_pattern = Pattern.compile("  ");
    // matches markup without capturing anything: <s>, <b>, <strong> tags (opening and closing),
    // opening tags of hashtag links, opening tags of twitter-atreply links, and spans
    final static Pattern cleanup_pattern = Pattern.compile("</?(s|b|strong)>|" + "<a href=\"/hashtag.*?>|"
            + "<a.*?class=\"twitter-atreply.*?>|" + "<span.*?span>");

    public static class TwitterTweet extends Post implements Runnable {

        // created with zero permits by the scraping constructor
        // NOTE(review): presumably released once the post-processing of the tweet is done - confirm in run()
        public final Semaphore ready;
        public MessageEntry moreData = new MessageEntry();
        // the author of the tweet
        public UserEntry user;
        // the two write flags are handed in by the scraper; how they are used is not visible in the fields
        public boolean writeToIndex;
        public boolean writeToBackend;

        // a time stamp that is given in loklak upon the arrival of the tweet which is the current local time
        public Date timestampDate;
        // the time given in the tweet which is the time when the user created it.
        // This is also used to do the index partition into minute, hour, week
        public Date created_at;
        // on means 'valid from'
        public Date on;
        // 'to' means 'valid_until' and may not be set
        public Date to;

        // where did the message come from
        protected SourceType source_type;
        // who created the message
        protected ProviderType provider_type;

        public String provider_hash, screen_name, retweet_from, postId, canonical_id, parent, text;
        // the full url of the tweet
        protected URL status_id_url;
        protected long retweet_count, favourites_count;
        // urls of the media attached to the tweet
        public Set<String> images, audios, videos;
        protected String place_name, place_id;

        // the following fields are either set as a common field or generated by extraction from field 'text' or from field 'place_name'
        // coordinate order is [longitude, latitude]
        protected double[] location_point, location_mark;
        // Value in metres
        protected int location_radius;
        protected LocationSource location_source;
        protected PlaceContext place_context;
        protected String place_country;

        // The length of tweets without links, users, hashtags
        // the following can be computed from the tweet data but is stored in the search index
        // to provide statistical data and ranking attributes
        private int without_l_len, without_lu_len, without_luh_len;

        // the arrays of links, users, hashtags
        private List<String> users, hosts, links, mentions, hashtags;

        private boolean enriched;

        /**
         * Creates a tweet from the raw values scraped out of a search result page.
         * The text is stored as given: this.text MUST be analysed with analyse(); this is not
         * done here because it should be started concurrently; run run().
         *
         * @param user_screen_name_raw screen name of the author
         * @param created_at_raw creation time of the tweet in milliseconds
         * @param created_at_name_raw creation time as text; not used here
         * @param status_id_url_raw path of the tweet; the part behind its last '/' becomes the post id
         * @param text_raw the text of the tweet as scraped
         * @param retweets retweet count
         * @param favourites favourite count
         * @param images urls of the images of the tweet
         * @param videos urls of the videos of the tweet
         * @param place_name name of the place of the tweet
         * @param place_id id of the place of the tweet
         * @param user the author
         * @param writeToIndex stored in the field of the same name
         * @param writeToBackend stored in the field of the same name
         * @throws MalformedURLException if no url can be built from status_id_url_raw
         */
        public TwitterTweet(final String user_screen_name_raw, final long created_at_raw,
                // Not used here but should be compared to created_at_raw
                final String created_at_name_raw, final String status_id_url_raw, final String text_raw,
                final long retweets, final long favourites, final Set<String> images, final Set<String> videos,
                final String place_name, final String place_id, final UserEntry user, final boolean writeToIndex,
                final boolean writeToBackend) throws MalformedURLException {
            super();

            // provenance: always a scraped tweet
            this.source_type = SourceType.TWITTER;
            this.provider_type = ProviderType.SCRAPED;

            // author and creation time
            this.user = user;
            this.screen_name = user_screen_name_raw;
            this.created_at = new Date(created_at_raw);

            // url of the tweet and the id at its end
            this.status_id_url = new URL("https://twitter.com" + status_id_url_raw);
            final int lastSlash = status_id_url_raw.lastIndexOf('/');
            if (lastSlash < 0) {
                this.postId = "-1";
            } else {
                this.postId = status_id_url_raw.substring(lastSlash + 1);
            }

            // content
            this.text = text_raw;
            this.images = images;
            this.videos = videos;
            this.retweet_count = retweets;
            this.favourites_count = favourites;

            // place; an attempt to derive coordinates from the place_id by reverse-engineering
            // has failed, so name and id are stored as they come
            this.place_name = place_name;
            this.place_id = place_id;

            // processing flags
            this.writeToIndex = writeToIndex;
            this.writeToBackend = writeToBackend;

            // no permit is available until one is released
            this.ready = new Semaphore(0);
        }

        /**
         * Restores a tweet from its JSON representation and enriches it immediately.
         * Every attribute is read with {@link #lazyGet(JSONObject, String)}, so missing keys yield null.
         * An unknown source type falls back to GENERIC, an unknown provider type to NOONE and
         * an unparseable link to a null status URL.
         *
         * @param json the JSON object holding the tweet attributes
         */
        public TwitterTweet(JSONObject json) {
            this.moreData = new MessageEntry();
            Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME);
            this.timestampDate = MessageEntry.parseDate(timestamp_obj);
            this.timestamp = this.timestampDate.getTime();
            Object created_at_obj = lazyGet(json, AbstractObjectEntry.CREATED_AT_FIELDNAME);
            this.created_at = MessageEntry.parseDate(created_at_obj);
            // parse the values read from JSON; the fields 'on' and 'to' themselves are still unset here
            Object on_obj = lazyGet(json, "on");
            this.on = on_obj == null ? null : MessageEntry.parseDate(on_obj);
            Object to_obj = lazyGet(json, "to");
            this.to = to_obj == null ? null : MessageEntry.parseDate(to_obj);
            String source_type_string = (String) lazyGet(json, "source_type");
            try {
                this.source_type = source_type_string == null ? SourceType.GENERIC
                        : SourceType.byName(source_type_string);
            } catch (IllegalArgumentException e) {
                this.source_type = SourceType.GENERIC;
            }
            String provider_type_string = (String) lazyGet(json, "provider_type");
            if (provider_type_string == null)
                provider_type_string = ProviderType.NOONE.name();
            try {
                this.provider_type = ProviderType.valueOf(provider_type_string);
            } catch (IllegalArgumentException e) {
                this.provider_type = ProviderType.NOONE;
            }
            this.provider_hash = (String) lazyGet(json, "provider_hash");
            this.screen_name = (String) lazyGet(json, "screen_name");
            this.retweet_from = (String) lazyGet(json, "retweet_from");
            this.postId = (String) lazyGet(json, "id_str");
            this.text = (String) lazyGet(json, "text");
            try {
                this.status_id_url = new URL((String) lazyGet(json, "link"));
            } catch (MalformedURLException e) {
                this.status_id_url = null;
            }
            this.retweet_count = MessageEntry.parseLong((Number) lazyGet(json, "retweet_count"));
            this.favourites_count = MessageEntry.parseLong((Number) lazyGet(json, "favourites_count"));
            this.images = MessageEntry.parseArrayList(lazyGet(json, "images"));
            this.audios = MessageEntry.parseArrayList(lazyGet(json, "audio"));
            this.videos = MessageEntry.parseArrayList(lazyGet(json, "videos"));
            this.place_id = MessageEntry.parseString((String) lazyGet(json, "place_id"));
            this.place_name = MessageEntry.parseString((String) lazyGet(json, "place_name"));
            this.place_country = MessageEntry.parseString((String) lazyGet(json, "place_country"));

            // only two-letter country codes are accepted
            if (this.place_country != null && this.place_country.length() != 2)
                this.place_country = null;

            // optional location
            Object location_point_obj = lazyGet(json, "location_point");
            Object location_radius_obj = lazyGet(json, "location_radius");
            Object location_mark_obj = lazyGet(json, "location_mark");
            Object location_source_obj = lazyGet(json, "location_source");
            if (location_point_obj == null || location_mark_obj == null || !(location_point_obj instanceof List<?>)
                    || !(location_mark_obj instanceof List<?>)) {
                this.location_point = null;
                this.location_radius = 0;
                this.location_mark = null;
                this.location_source = null;
            } else {
                // coordinates are read as Number so that integral or other numeric JSON values
                // do not fail with a ClassCastException
                List<?> point = (List<?>) location_point_obj;
                List<?> mark = (List<?>) location_mark_obj;
                this.location_point = new double[] { ((Number) point.get(0)).doubleValue(),
                        ((Number) point.get(1)).doubleValue() };
                this.location_radius = (int) MessageEntry.parseLong((Number) location_radius_obj);
                this.location_mark = new double[] { ((Number) mark.get(0)).doubleValue(),
                        ((Number) mark.get(1)).doubleValue() };
                this.location_source = LocationSource.valueOf((String) location_source_obj);
            }
            this.enriched = false;

            // load enriched data
            enrich();

            // may lead to error!!
            this.ready = new Semaphore(0);
            //this.user = null;
            //this.writeToIndex = false;
            //this.writeToBackend = false;
        }

        /**
         * Creates an empty tweet: strings are empty, counters are zero, collections are empty,
         * location attributes are unset and both timestamps are set to the current time.
         *
         * @throws MalformedURLException declared for callers; no URL is constructed here
         */
        public TwitterTweet() throws MalformedURLException {
            // helper object, no classification yet
            this.moreData = new MessageEntry();
            this.moreData.classifier = null;

            // time
            this.timestamp = System.currentTimeMillis();
            this.timestampDate = new Date(this.timestamp);
            this.created_at = new Date();
            this.on = null;
            this.to = null;

            // provenance
            this.source_type = SourceType.GENERIC;
            this.provider_type = ProviderType.NOONE;
            this.provider_hash = "";

            // identity and content
            this.screen_name = "";
            this.retweet_from = "";
            this.postId = "";
            this.canonical_id = "";
            this.parent = "";
            this.text = "";
            this.status_id_url = null;

            // counters
            this.retweet_count = 0;
            this.favourites_count = 0;
            this.without_l_len = 0;
            this.without_lu_len = 0;
            this.without_luh_len = 0;

            // media
            this.images = new HashSet<>();
            this.audios = new HashSet<>();
            this.videos = new HashSet<>();

            // place and location
            this.place_id = "";
            this.place_name = "";
            this.place_context = null;
            this.place_country = null;
            this.location_point = null;
            this.location_radius = 0;
            this.location_mark = null;
            this.location_source = null;

            // extracted entities
            this.hosts = new ArrayList<>();
            this.links = new ArrayList<>();
            this.mentions = new ArrayList<>();
            this.hashtags = new ArrayList<>();

            this.enriched = false;

            // may lead to error!!
            this.ready = new Semaphore(0);
            //this.user = null;
            //this.writeToIndex = false;
            //this.writeToBackend = false;
        }

        //TODO: fix the location issue and shift to MessageEntry class
        /**
         * Tries to resolve a location for this message when no location point is set yet and
         * DAO.geoNames is available. The place name is analysed first (only if the location source is
         * unset, ANNOTATION or PLACE); if that yields nothing, the text and hashtags are analysed.
         * On success the location point, mark, country and - if it was empty - the place name are set.
         * Note that place_context and location_source are updated by each attempt, even an unsuccessful one.
         */
        public void getLocation() {
            final boolean pointKnown = this.location_point != null && this.location_point.length != 0;
            if (pointKnown || DAO.geoNames == null) {
                return;
            }

            final String textHash = Integer.toString(this.text.hashCode());
            final boolean placeNamed = this.place_name != null && this.place_name.length() > 0;
            final LocationSource source = this.location_source;
            final boolean sourceAllowsPlace = source == null || source == LocationSource.ANNOTATION
                    || source == LocationSource.PLACE;

            GeoMark mark = null;
            // first attempt: the place name attached to the message
            if (placeNamed && sourceAllowsPlace) {
                mark = DAO.geoNames.analyse(this.place_name, null, 5, textHash);
                this.place_context = PlaceContext.FROM;
                this.location_source = LocationSource.PLACE;
            }
            // second attempt: the message text together with its hashtags
            if (mark == null) {
                mark = DAO.geoNames.analyse(this.text, this.hashtags.toArray(new String[0]), 5, textHash);
                this.place_context = PlaceContext.ABOUT;
                this.location_source = LocationSource.ANNOTATION;
            }
            if (mark == null) {
                return;
            }

            if (!placeNamed) {
                this.place_name = mark.getNames().iterator().next();
            }
            this.location_radius = 0;
            this.location_point = new double[] { mark.lon(), mark.lat() }; //[longitude, latitude]
            this.location_mark = new double[] { mark.mlon(), mark.mlat() }; //[longitude, latitude]
            this.place_country = mark.getISO3166cc();
        }

        /**
         * Creates enriched data, useful for analytics and ranking; does nothing if already enriched:
         * - classify the text
         * - identify all mentioned users, hashtags and links and compute the text lengths without them
         * - resolve a location
         */
        public void enrich() {
            if (!this.enriched) {
                this.moreData.classifier = Classifier.classify(this.text);
                enrichData(this.text);
                getLocation();

                this.enriched = true;
            }
        }

        /**
         * Extracts links, hosts, media links, users, mentions and hashtags from the given text and
         * records the text length after each of the three extraction steps.
         * NOTE(review): the extractors receive a String copy, so whether the recorded lengths really
         * exclude the extracted entities depends on MessageEntry - confirm.
         *
         * @param inputText the text to analyse; must not be null
         */
        public void enrichData(String inputText) {
            // new StringBuilder(String) rejects null, so a null inputText fails right here
            CharSequence remaining = new StringBuilder(inputText);

            // step 1: links
            this.links = this.moreData.extractLinks(remaining.toString());
            remaining = MessageEntry.SPACEX_PATTERN.matcher(remaining).replaceAll(" ").trim();
            this.without_l_len = remaining.length();

            // hosts and media are derived from the links
            this.hosts = this.moreData.extractHosts(links);
            this.videos = this.moreData.getLinksVideo(this.links, this.videos);
            this.images = this.moreData.getLinksImage(this.links, this.images);
            this.audios = this.moreData.getLinksAudio(this.links, this.audios);

            // step 2: users
            this.users = this.moreData.extractUsers(remaining.toString());
            remaining = MessageEntry.SPACEX_PATTERN.matcher(remaining).replaceAll(" ").trim();
            this.without_lu_len = remaining.length();

            // a mention is the user token without its first character
            this.mentions = new ArrayList<String>();
            for (final String userToken : this.users) {
                this.mentions.add(userToken.substring(1));
            }

            // step 3: hashtags
            this.hashtags = this.moreData.extractHashtags(remaining.toString());
            remaining = MessageEntry.SPACEX_PATTERN.matcher(remaining).replaceAll(" ").trim();
            this.without_luh_len = remaining.length();
        }

        /**
         * Lists the channels on which the Tweet will be published, in this order:
         *      twitter/mention/*username*      (one per mention)
         *      twitter/hashtag/*hashtag*       (one per hashtag)
         *      twitter/user/*username*         (User who posted the Tweet)
         *      twitter/country/*country code*  (only if a country is known)
         *      twitter/text/*token*            (one per normalized text token)
         *      all
         *      twitter
         * @return Array of channels to publish message to
         */
        @Override
        protected String[] getStreamChannels() {
            final List<String> targets = new ArrayList<>();

            for (final String name : this.mentions) {
                targets.add("twitter/mention/" + name);
            }
            for (final String tag : this.hashtags) {
                targets.add("twitter/hashtag/" + tag);
            }

            targets.add("twitter/user/" + this.getScreenName());

            final String country = this.place_country;
            if (country != null) {
                targets.add("twitter/country/" + country);
            }

            for (String word : Classifier.normalize(this.text)) {
                targets.add("twitter/text/" + word);
            }

            targets.add("all");
            targets.add("twitter");

            return targets.toArray(new String[0]);
        }

        /**
         * Postprocesses the tweet: unshortens the text and the user name, enriches the message and,
         * depending on the flags given at construction, schedules it for indexing and transmits it
         * to the backend. Any error is logged; the ready semaphore is released in every case.
         */
        @Override
        public void run() {
            try {
                // expand markup and shortened links before the text is analysed
                this.text = unshorten(this.text);
                final String expandedName = unshorten(this.user.getName());
                this.user.setName(expandedName);

                this.enrich();

                if (this.writeToIndex) {
                    IncomingMessageBuffer.addScheduler(this, this.user, true);
                }
                if (this.writeToBackend) {
                    DAO.outgoingMessages.transmitMessage(this, this.user);
                }
            } catch (Throwable e) {
                DAO.severe(e);
            } finally {
                // many permits, so that every waiting caller can pass
                this.ready.release(1000);
            }
        }

        /**
         * Checks without blocking whether postprocessing has finished.
         *
         * @return true if the ready semaphore has at least one permit
         * @throws RuntimeException if no ready semaphore exists
         */
        public boolean isReady() {
            final Semaphore gate = this.ready;
            if (gate == null) {
                throw new RuntimeException("isReady() should not be called if postprocessing is not started");
            }
            return gate.availablePermits() > 0;
        }

        /**
         * Waits until postprocessing has finished or the timeout elapsed.
         *
         * @param millis maximum time to wait in milliseconds
         * @return true if a permit was available or could be acquired in time; false on timeout or
         *         if the waiting thread was interrupted (the interrupt flag is restored in that case)
         * @throws RuntimeException if no ready semaphore exists
         */
        public boolean waitReady(long millis) {
            if (this.ready == null)
                throw new RuntimeException("waitReady() should not be called if postprocessing is not started");
            if (this.ready.availablePermits() > 0)
                return true;
            try {
                return this.ready.tryAcquire(millis, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                // do not swallow the interruption: restore the flag so that callers can react to it
                Thread.currentThread().interrupt();
                return false;
            }
        }

        /**
         * Serializes this tweet including all calculated data and without a user object.
         * Delegates to the four-argument toJSON with {@code (null, true, Integer.MAX_VALUE, "")}.
         *
         * @return the result of the delegated call
         */
        public Post toJSON() {
            // very important to include calculated data here because that is written
            // into the index using the abstract index factory
            return toJSON(null, true, Integer.MAX_VALUE, "");
        }

        /**
         * Writes the attributes of this tweet into this object with {@code put} and returns this object.
         *
         * @param user if not null, its JSON form is added under the key "user"
         * @param calculatedData if true, derived attributes (text length, location, extracted entities,
         *        length statistics and classifier results) are added as well
         * @param iflinkexceedslength handed to {@code moreData.getText}; semantics are defined there
         * @param urlstub handed to {@code moreData.getText}; semantics are defined there
         * @return this object
         */
        public Post toJSON(final UserEntry user, final boolean calculatedData, final int iflinkexceedslength,
                final String urlstub) {

            // tweet data
            this.put(AbstractObjectEntry.TIMESTAMP_FIELDNAME,
                    AbstractObjectEntry.utcFormatter.print(getTimestampDate().getTime()));
            this.put(AbstractObjectEntry.CREATED_AT_FIELDNAME,
                    AbstractObjectEntry.utcFormatter.print(getCreatedAt().getTime()));
            if (this.on != null)
                this.put("on", AbstractObjectEntry.utcFormatter.print(this.on.getTime()));
            if (this.to != null)
                this.put("to", AbstractObjectEntry.utcFormatter.print(this.to.getTime()));
            this.put("screen_name", this.screen_name);
            if (this.retweet_from != null && this.retweet_from.length() > 0)
                this.put("retweet_from", this.retweet_from);
            // the tweet; the cleanup is a helper function which cleans mistakes from the past in scraping
            MessageEntry.TextLinkMap tlm = this.moreData.getText(iflinkexceedslength, urlstub, this.text,
                    this.getLinks(), this.getPostId());
            this.put("text", tlm);
            if (this.status_id_url != null)
                this.put("link", this.status_id_url.toExternalForm());
            this.put("id_str", this.postId);
            if (this.canonical_id != null)
                this.put("canonical_id", this.canonical_id);
            if (this.parent != null)
                this.put("parent", this.parent);
            this.put("source_type", this.source_type.toString());
            this.put("provider_type", this.provider_type.name());
            if (this.provider_hash != null && this.provider_hash.length() > 0)
                this.put("provider_hash", this.provider_hash);
            this.put("retweet_count", this.retweet_count);
            // there is a slight inconsistency here in the plural naming but thats how it is noted in the twitter api
            this.put("favourites_count", this.favourites_count);
            this.put("place_name", this.place_name);
            this.put("place_id", this.place_id);

            // add statistic/calculated data
            if (calculatedData) {

                // text length
                this.put("text_length", this.text.length());

                // location data; the country is only written for two-letter country codes
                if (this.place_context != null)
                    this.put("place_context", this.place_context.name());
                if (this.place_country != null && this.place_country.length() == 2) {
                    this.put("place_country", DAO.geoNames.getCountryName(this.place_country));
                    this.put("place_country_code", this.place_country);
                    this.put("place_country_center", DAO.geoNames.getCountryCenter(this.place_country));
                }

                // add optional location data; requires both point and mark to have two coordinates.
                // NOTE(review): an earlier comment claimed this is written even if calculatedData == false
                // for sources from REPORT, but the code only writes it inside this calculatedData branch
                if (this.location_point != null && this.location_point.length == 2 && this.location_mark != null
                        && this.location_mark.length == 2) {
                    // reference for this format:
                    // https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-geo-point-type.html#_lat_lon_as_array_5
                    this.put("location_point", this.location_point); // [longitude, latitude]
                    this.put("location_radius", this.location_radius);
                    this.put("location_mark", this.location_mark);
                    this.put("location_source", this.location_source.name());
                }

                // redundant data for enhanced navigation with aggregations
                this.put("hosts", this.hosts);
                this.put("hosts_count", this.hosts.size());
                this.put("links", this.links);
                this.put("links_count", this.links.size());
                this.put("unshorten", tlm.short2long);
                this.put("images", this.images);
                this.put("images_count", this.images.size());
                this.put("audio", this.audios);
                this.put("audio_count", this.audios.size());
                this.put("videos", this.videos);
                this.put("videos_count", this.videos.size());
                this.put("mentions", this.mentions);
                this.put("mentions_count", this.mentions.size());
                this.put("hashtags", this.hashtags);
                this.put("hashtags_count", this.hashtags.size());

                // experimental, for ranking
                this.put("without_l_len", this.without_l_len);
                this.put("without_lu_len", this.without_lu_len);
                this.put("without_luh_len", this.without_luh_len);

                // text classifier
                if (this.moreData.classifier != null) {
                    for (Map.Entry<Context, Classification<String, Category>> c : this.moreData.classifier
                            .entrySet()) {
                        assert c.getValue() != null;
                        // we don't store non-existing classifications
                        if (c.getValue().getCategory() == Classifier.Category.NONE)
                            continue;
                        this.put("classifier_" + c.getKey().name(), c.getValue().getCategory());
                        // an infinite probability is stored as Float.MAX_VALUE
                        this.put("classifier_" + c.getKey().name() + "_probability",
                                c.getValue().getProbability() == Float.POSITIVE_INFINITY ? Float.MAX_VALUE
                                        : c.getValue().getProbability());
                    }
                }
            }

            // add user
            if (user != null)
                this.put("user", user.toJSON());
            return this;
        }

        /**
         * Tells whether the text contains at least one match of timeline_link_pattern.
         * unshorten() resolves every such match through RedirectUnshortener.
         *
         * @return true if the text contains a timeline link
         */
        public boolean willBeTimeConsuming() {
            return timeline_link_pattern.matcher(this.text).find();
        }

        /**
         * Reads a value from a JSON object without failing on absent keys.
         *
         * @param json the object to read from
         * @param key the key to look up
         * @return the value stored for the key, or null if the lookup raised a JSONException
         */
        public Object lazyGet(JSONObject json, String key) {
            try {
                return json.get(key);
            } catch (JSONException notAvailable) {
                return null;
            }
        }

        /**
         * @return the user entry stored with this tweet
         */
        public UserEntry getUser() {
            return this.user;
        }

        /**
         * @return the timestamp date, or the current time if none is set
         */
        public Date getTimestampDate() {
            if (this.timestampDate != null) {
                return this.timestampDate;
            }
            return new Date();
        }

        /**
         * @return the creation date, or the current time if none is set
         */
        public Date getCreatedAt() {
            if (this.created_at != null) {
                return this.created_at;
            }
            return new Date();
        }

        /**
         * @param created_at the new creation date
         */
        public void setCreatedAt(Date created_at) {
            this.created_at = created_at;
        }

        /**
         * @return the 'on' date; may be null
         */
        public Date getOn() {
            return this.on;
        }

        /**
         * @param on the new 'on' date
         */
        public void setOn(Date on) {
            this.on = on;
        }

        /**
         * @return the 'to' date; may be null
         */
        public Date getTo() {
            return this.to;
        }

        /**
         * @param to the new 'to' date
         */
        public void setTo(Date to) {
            this.to = to;
        }

        /**
         * @return the source type of this message
         */
        public SourceType getSourceType() {
            return this.source_type;
        }

        /**
         * @param source_type the new source type
         */
        public void setSourceType(SourceType source_type) {
            this.source_type = source_type;
        }

        /**
         * @return the provider type of this message
         */
        public ProviderType getProviderType() {
            return this.provider_type;
        }

        /**
         * @param provider_type the new provider type
         */
        public void setProviderType(ProviderType provider_type) {
            this.provider_type = provider_type;
        }

        /**
         * @return the provider hash; may be null or empty
         */
        public String getProviderHash() {
            return this.provider_hash;
        }

        /**
         * @param provider_hash the new provider hash
         */
        public void setProviderHash(String provider_hash) {
            this.provider_hash = provider_hash;
        }

        /**
         * @return the screen name of the author
         */
        public String getScreenName() {
            return this.screen_name;
        }

        /**
         * @param user_screen_name the new screen name of the author
         */
        public void setScreenName(String user_screen_name) {
            this.screen_name = user_screen_name;
        }

        /**
         * @return the value of the retweet_from attribute; may be null or empty
         */
        public String getRetweetFrom() {
            return this.retweet_from;
        }

        /**
         * @param retweet_from the new value of the retweet_from attribute
         */
        public void setRetweetFrom(String retweet_from) {
            this.retweet_from = retweet_from;
        }

        /**
         * @return the URL of the status; may be null
         */
        public URL getStatusIdUrl() {
            return this.status_id_url;
        }

        /**
         * @param status_id_url the new URL of the status
         */
        public void setStatusIdUrl(URL status_id_url) {
            this.status_id_url = status_id_url;
        }

        /**
         * @return the retweet counter
         */
        public long getRetweetCount() {
            return this.retweet_count;
        }

        /**
         * @param retweet_count the new retweet counter
         */
        public void setRetweetCount(long retweet_count) {
            this.retweet_count = retweet_count;
        }

        /**
         * @return the favourites counter
         */
        public long getFavouritesCount() {
            return this.favourites_count;
        }

        /**
         * @param favourites_count the new favourites counter
         */
        public void setFavouritesCount(long favourites_count) {
            this.favourites_count = favourites_count;
        }

        /**
         * @return the name of the place; may be null or empty
         */
        public String getPlaceName() {
            return this.place_name;
        }

        /**
         * Sets the place name together with its context.
         *
         * @param place_name the new place name
         * @param place_context the new place context
         */
        public void setPlaceName(String place_name, PlaceContext place_context) {
            this.place_name = place_name;
            this.place_context = place_context;
        }

        /**
         * @return the place context; may be null
         */
        public PlaceContext getPlaceContext() {
            return this.place_context;
        }

        /**
         * @return the id of the place; may be null or empty
         */
        public String getPlaceId() {
            return this.place_id;
        }

        /**
         * @param place_id the new id of the place
         */
        public void setPlaceId(String place_id) {
            this.place_id = place_id;
        }

        /**
         * Get the location point
         * @return the location point as [longitude, latitude]; may be null
         */
        public double[] getLocationPoint() {
            return this.location_point;
        }

        /**
         * Set the location point
         * @param location_point in the form [longitude, latitude]
         */
        public void setLocationPoint(double[] location_point) {
            this.location_point = location_point;
        }

        /**
         * @return the post id, or the string "null" if no post id is set
         */
        public String getPostId() {
            final String id = this.postId;
            return id == null ? "null" : id;
        }

        //TODO: to implement this method
        // Current placeholder: concatenates the timestamp and the creation time as the post id.
        private void setPostId() {
            this.postId = String.valueOf(this.timestamp) + String.valueOf(this.created_at.getTime());
        }

        /**
         * Get the location mark
         * @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint(); may be null
         */
        public double[] getLocationMark() {
            return this.location_mark;
        }

        /**
         * Set the location mark
         * @param location_mark in the form [longitude, latitude]
         */
        public void setLocationMark(double[] location_mark) {
            this.location_mark = location_mark;
        }

        /**
         * Get the radius in metres
         * @return radius in metres around getLocationPoint() (NOT getLocationMark())
         */
        public int getLocationRadius() {
            return this.location_radius;
        }

        /**
         * @param location_radius the new radius in metres
         */
        public void setLocationRadius(int location_radius) {
            this.location_radius = location_radius;
        }

        /**
         * @return the source of the location information; may be null
         */
        public LocationSource getLocationSource() {
            return this.location_source;
        }

        /**
         * @param location_source the new source of the location information
         */
        public void setLocationSource(LocationSource location_source) {
            this.location_source = location_source;
        }

        /**
         * @return the message text
         */
        public String getText() {
            return this.text;
        }

        /**
         * @param text the new message text
         */
        public void setText(String text) {
            this.text = text;
        }

        /**
         * @return the length of the message text; fails with a NullPointerException if no text is set
         */
        public int getTextLength() {
            return this.text.length();
        }

        /**
         * @return the post id parsed as a long
         * @throws NumberFormatException if the post id is not a parsable long (e.g. empty or null)
         */
        public long getId() {
            return Long.parseLong(this.postId);
        }

        /**
         * @return the internal list of hosts (not a copy); may be null
         */
        public List<String> getHosts() {
            return this.hosts;
        }

        /**
         * @return the internal set of video links (not a copy); may be null
         */
        public Set<String> getVideos() {
            return this.videos;
        }

        /**
         * @return the internal set of audio links (not a copy); may be null
         */
        public Set<String> getAudio() {
            return this.audios;
        }

        /**
         * @return the internal set of image links (not a copy); may be null
         */
        public Set<String> getImages() {
            return this.images;
        }

        /**
         * Adds a single image link to the set of images; the set is created on demand.
         *
         * @param image the image link to add
         */
        public void setImages(String image) {
            if (this.images == null) {
                this.images = new HashSet<>();
            }
            this.images.add(image);
        }

        /**
         * @return a new array with the mentions; empty if no mentions were extracted yet
         */
        public String[] getMentions() {
            return this.mentions == null ? new String[0] : this.mentions.toArray(new String[0]);
        }

        /**
         * @return a new array with the hashtags; empty if no hashtags were extracted yet
         *         (same contract as getMentions())
         */
        public String[] getHashtags() {
            // the list is only filled by enrichment, so it can still be unset
            if (this.hashtags == null) {
                return new String[0];
            }
            return this.hashtags.toArray(new String[0]);
        }

        /**
         * @return a new array with the links; empty if no links were extracted yet
         *         (same contract as getMentions())
         */
        public String[] getLinks() {
            // the list is only filled by enrichment, so it can still be unset
            if (this.links == null) {
                return new String[0];
            }
            return this.links.toArray(new String[0]);
        }

        /**
         * Looks up the classification of this message for a context; delegates to moreData.
         *
         * @param context the classification context
         * @return the category as reported by {@code moreData.getClassifier(context)}
         */
        public Classifier.Category getClassifier(Classifier.Context context) {
            return this.moreData.getClassifier(context);
        }

    }

    /**
     * Resolves timeline markup in a scraped text. One replacement is made per loop pass, in this priority:
     * <ol>
     * <li>a match of emoji_pattern is replaced by its group 1</li>
     * <li>a match of emoji_pattern_span is replaced by its group 1</li>
     * <li>a match of hashtag_pattern is replaced by " #" + group 1 + " "</li>
     * <li>a match of timeline_link_pattern is replaced by a space and the result of
     *     RedirectUnshortener.unShorten for group 2</li>
     * <li>a match of timeline_embed_pattern is removed</li>
     * </ol>
     * The loop ends when no pattern matches any more or when a step fails (the failure is logged).
     * Afterwards cleanup_pattern matches are removed, HTML is converted with MessageEntry.html2utf8,
     * doublespace_pattern matches are collapsed to a single space and the text is trimmed.
     * <p>
     * All inserted strings are quoted with {@link Matcher#quoteReplacement(String)} so that '$' and '\'
     * inside captured text or expanded URLs are inserted literally instead of being interpreted
     * as group references or escapes.
     *
     * @param text the raw text
     * @return the cleaned text
     */
    public static String unshorten(String text) {
        while (true) {
            try {
                Matcher m = emoji_pattern.matcher(text);
                if (m.find()) {
                    String emoji = m.group(1);
                    text = m.replaceFirst(Matcher.quoteReplacement(emoji));
                    continue;
                }
            } catch (Throwable e) {
                DAO.severe(e);
                break;
            }
            try {
                Matcher m = emoji_pattern_span.matcher(text);
                if (m.find()) {
                    String emoji = m.group(1);
                    text = m.replaceFirst(Matcher.quoteReplacement(emoji));
                    continue;
                }
            } catch (Throwable e) {
                DAO.severe(e);
                break;
            }
            try {
                Matcher m = hashtag_pattern.matcher(text);
                if (m.find()) {
                    // the extra spaces are needed because twitter removes them if the hashtag is followed with a link
                    String hashtag = " #" + m.group(1) + " ";
                    text = m.replaceFirst(Matcher.quoteReplacement(hashtag));
                    continue;
                }
            } catch (Throwable e) {
                DAO.severe(e);
                break;
            }
            try {
                Matcher m = timeline_link_pattern.matcher(text);
                if (m.find()) {
                    String expanded = RedirectUnshortener.unShorten(m.group(2));
                    text = m.replaceFirst(Matcher.quoteReplacement(" " + expanded));
                    continue;
                }
            } catch (Throwable e) {
                DAO.severe(e);
                break;
            }
            try {
                Matcher m = timeline_embed_pattern.matcher(text);
                if (m.find()) {
                    text = m.replaceFirst("");
                    continue;
                }
            } catch (Throwable e) {
                DAO.severe(e);
                break;
            }
            break;
        }
        text = cleanup_pattern.matcher(text).replaceAll("");
        text = MessageEntry.html2utf8(text);
        text = doublespace_pattern.matcher(text).replaceAll(" ");
        text = text.trim();
        return text;
    }

    /**
     * Command line test: searches for a query or parses a stored search result page and prints
     * all found tweets.
     * Usage: java org.loklak.harvester.TwitterScraper &lt;query | /path/to/stored/page&gt;
     *
     * @param args args[0] is the search query, or - if it starts with "/" - the path of a file to parse
     */
    public static void main(String[] args) {
        // fail with a usage hint instead of an ArrayIndexOutOfBoundsException
        if (args == null || args.length == 0) {
            System.err.println("usage: java org.loklak.harvester.TwitterScraper <query | /path/to/stored/page>");
            System.exit(1);
            return;
        }
        //wget --no-check-certificate "https://twitter.com/search?q=eifel&src=typd&f=realtime"
        ArrayList<String> filterList = new ArrayList<String>();
        filterList.add("image");
        Timeline[] result = null;
        if (args[0].startsWith("/"))
            result = parse(new File(args[0]), Timeline.Order.CREATED_AT, true, true);
        else
            result = TwitterScraper.search(args[0], filterList, Timeline.Order.CREATED_AT, true, true);
        int all = 0;
        for (int x = 0; x < 2; x++) {
            if (x == 0)
                System.out.println("Timeline[0] - finished to be used:");
            if (x == 1)
                System.out.println("Timeline[1] - messages which are in postprocessing");
            all += result[x].size();
            for (TwitterTweet tweet : result[x]) {
                // give postprocessing up to ten seconds before the tweet is printed
                tweet.waitReady(10000);
                System.out.println(tweet.getCreatedAt().toString() + " from @" + tweet.getScreenName() + " - "
                        + tweet.getText());
            }
        }
        System.out.println("count: " + all);
        System.exit(0);
    }

}