org.loklak.api.search.TweetScraper.java Source code

Java tutorial

Introduction

Here is the source code for org.loklak.api.search.TweetScraper.java

Source

/**
 *  TwitterScraper
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.api.search;

import org.loklak.objects.AbstractObjectEntry;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.json.JSONObject;
import org.loklak.data.DAO;
import org.loklak.harvester.BaseScraper;
import org.loklak.harvester.Post;
import org.loklak.harvester.RedirectUnshortener;
import org.loklak.objects.Timeline2;
import org.loklak.server.BaseUserRole;
import java.net.URISyntaxException;
import org.apache.http.client.utils.URIBuilder;

import java.io.BufferedReader;
import java.io.IOException;
import org.json.JSONException;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.ProviderType;
import org.loklak.objects.SourceType;
import org.loklak.objects.UserEntry;

public class TweetScraper extends BaseScraper {

    /** Serialization version identifier of this class. */
    private static final long serialVersionUID = -3803127453010542460L;
    // Matches a span with class "Emoji Emoji--forLinks" followed by a span with class "visuallyhidden"
    // and aria-hidden="true"; group 1 is the text content of the second span. Used by unshorten().
    private static final Pattern emoji_pattern_span = Pattern.compile(
            "<span [^>]*class=\"Emoji Emoji--forLinks\" [^>]*>[\\n]*[^<]*</span>[\\n]*<span [^>]*class=\"visuallyhidden\" [^>]*aria-hidden=\"true\"[^>]*>[\\n]*([^<]*)[\\n]*</span>");
    // Matches an anchor whose href starts with "/hashtag/" and whose class starts with "twitter-hashtag";
    // group 1 is the content of the <b> element, i.e. the hashtag without the leading '#'.
    private final static Pattern hashtag_pattern = Pattern
            .compile("<a href=\"/hashtag/.*?\".*?class=\"twitter-hashtag.*?\".*?><s>#</s><b>(.*?)</b></a>");
    // Matches an https anchor carrying a data-expanded-url attribute, the text "twitter-timeline-link" and a title;
    // group 1 is the href without the scheme, group 2 the data-expanded-url value, group 3 the title value.
    private final static Pattern timeline_link_pattern = Pattern.compile(
            "<a href=\"https://(.*?)\".*? data-expanded-url=\"(.*?)\".*?twitter-timeline-link.*?title=\"(.*?)\".*?>.*?</a>");
    // Matches an anchor to a t.co link whose text starts with "pic.twitter.com/";
    // group 1 is the t.co URL, group 2 the text following "pic.twitter.com/".
    private final static Pattern timeline_embed_pattern = Pattern
            .compile("<a href=\"(https://t.co/\\w+)\" class=\"twitter-timeline-link.*?>pic.twitter.com/(.*?)</a>");
    // Matches an img element with class "Emoji Emoji--forText"; group 1 is the value of its alt attribute.
    private final static Pattern emoji_pattern = Pattern
            .compile("<img .*?class=\"Emoji Emoji--forText\".*?alt=\"(.*?)\".*?>");
    // Matches two consecutive spaces; unshorten() replaces each match with a single space.
    private final static Pattern doublespace_pattern = Pattern.compile("  ");
    // Markup that unshorten() removes after all replacements: opening/closing s, b and strong tags,
    // opening anchors of hashtag links, opening anchors with class "twitter-atreply", and everything
    // from "<span" up to the next "span>".
    private final static Pattern cleanup_pattern = Pattern.compile("</?(s|b|strong)>|" + "<a href=\"/hashtag.*?>|"
            + "<a.*?class=\"twitter-atreply.*?>|" + "<span.*?span>");

    // Entries of the comma-separated "filter" extra value; assigned in setParam()
    private ArrayList<String> filterList = null;
    // Value of the "since" extra value, or null when it is empty; assigned in setParam()
    private String since = null;
    // Value of the "until" extra value, or null when it is empty; assigned in setParam()
    private String until = null;
    //TODO: implement with enriched data
    // True when the "enrich" extra value equals "true"; assigned in setParam(), not read in this class yet
    private boolean enrich = false;

    /**
     * Creates a scraper that only knows its base URL and its scraper name.
     * No request parameters are evaluated by this constructor.
     */
    public TweetScraper() {
        this.scraperName = "twitter";
        this.baseUrl = "https://www.twitter.com/";
    }

    /**
     * Creates a scraper for the given search term.
     *
     * @param _query the search term; it is stored as the "query" extra value, from which
     *               {@link #setParam()} reads the query of this scraper
     */
    public TweetScraper(String _query) {
        this();
        // take over the argument first: without this the still unset field would be
        // stored as the "query" extra value and the search term would be lost
        this.query = _query;
        this.setExtraValue("query", this.query);
        this.setParam();
    }

    /**
     * Creates a scraper for the given search term and additional request parameters.
     * The extra values are evaluated first; afterwards the given search term replaces
     * the query that was read from them.
     *
     * @param _query the search term
     * @param _extra the extra values handed to {@code setExtra}
     */
    public TweetScraper(String _query, Map<String, String> _extra) {
        this();
        setExtra(_extra);
        setParam();
        query = _query;
        setExtraValue("query", _query);
    }

    /**
     * Creates a scraper that takes all of its settings, including the query,
     * from the given extra values.
     *
     * @param _extra the extra values handed to {@code setExtra}
     */
    public TweetScraper(Map<String, String> _extra) {
        this();
        setExtra(_extra);
        setParam();
    }

    /**
     * Copies the extra values "filter", "since", "until", "query" and "enrich"
     * into the fields of this scraper. An empty "since" or "until" value is stored as null.
     */
    protected void setParam() {
        // the filter argument is a comma-separated list
        String[] filterValues = getExtraValue("filter").split(",");
        this.filterList = new ArrayList<String>(Arrays.asList(filterValues));

        String sinceValue = getExtraValue("since");
        if ("".equals(sinceValue)) {
            this.since = null;
        } else {
            this.since = sinceValue;
        }

        String untilValue = getExtraValue("until");
        if ("".equals(untilValue)) {
            this.until = null;
        } else {
            this.until = untilValue;
        }

        this.query = getExtraValue("query");

        String enrichValue = getExtraValue("enrich");
        this.enrich = enrichValue.equals("true");
    }

    /**
     * Returns the API path of this scraper.
     *
     * @return the constant string "/api/twitterscraper"
     */
    @Override
    public String getAPIPath() {
        return "/api/twitterscraper";
    }

    /**
     * Returns the minimal base user role of this scraper.
     *
     * @return always {@code BaseUserRole.ANONYMOUS}
     */
    @Override
    public BaseUserRole getMinimalBaseUserRole() {
        return BaseUserRole.ANONYMOUS;
    }

    /**
     * Returns the default permissions for a user role.
     *
     * @param baseUserRole the role asked for; not evaluated
     * @return always null
     */
    @Override
    public JSONObject getDefaultPermissions(BaseUserRole baseUserRole) {
        return null;
    }

    /**
     * Returns an empty map of extra values.
     *
     * @param _extra not evaluated
     * @return a new, empty {@code HashMap}
     */
    protected Map<?, ?> getExtra(String _extra) {
        return new HashMap<String, String>();
    }

    /**
     * Builds the URL of the Twitter search page for the query of this scraper.
     * <p>
     * Side effect: when {@code since} or {@code until} are set, the corresponding
     * {@code since:} / {@code until:} term is appended to {@code this.query}.
     *
     * @param type kind of result to search for: "user", "tweet", "image" or "video";
     *             every other value, including null, is handled like "tweet"
     * @return the search URL with the query parameters f, q, vertical and src
     * @throws IllegalStateException if base URL and search path do not form a valid URI
     */
    protected String prepareSearchUrl(String type) {
        final String midUrl = "search/";

        if (this.since != null) {
            this.query = this.query + " " + "since:" + this.since;
        }
        if (this.until != null) {
            this.query = this.query + " " + "until:" + this.until;
        }

        // A filter list that consists only of "video" narrows a tweet search to videos.
        // Filter lists with further entries are applied per tweet in filterPosts().
        final boolean videoOnly = this.filterList != null && this.filterList.size() == 1
                && this.filterList.contains("video");

        final String typeMedia;
        switch (type == null ? "" : type) {
        case "user":
            typeMedia = "users";
            break;
        case "image":
            typeMedia = "images";
            break;
        case "video":
            typeMedia = "videos";
            break;
        default:
            // "tweet" and every unknown type
            typeMedia = videoOnly ? "videos" : "tweets";
            break;
        }

        try {
            URIBuilder url = new URIBuilder(this.baseUrl + midUrl);
            url.addParameter("f", typeMedia);
            url.addParameter("q", this.query);
            url.addParameter("vertical", "default");
            url.addParameter("src", "typd");
            return url.toString();
        } catch (URISyntaxException e) {
            String message = "Invalid Url: baseUrl = " + this.baseUrl + ", mid-URL = " + midUrl + ", query = "
                    + this.query + ", type = " + type;
            DAO.log(message);
            // without a builder there is no URL to return; fail with the cause attached
            // instead of running into a NullPointerException
            throw new IllegalStateException(message, e);
        }
    }

    /**
     * Loads the search page for the "type" extra value and returns the scraped data.
     *
     * @return a post holding the scraped data under the name of this scraper; the data
     *         is null when the connection failed with an {@code IOException}
     */
    @Override
    public Post getResults() {
        final String type = this.getExtraValue("type");
        final String url = this.prepareSearchUrl(type);

        Post output = null;
        try {
            output = this.getDataFromConnection(url, type);
        } catch (IOException e) {
            DAO.severe("Possibly connection issue!!");
        }

        // wrap the result under the scraper name
        Post postArray = new Post();
        postArray.put(this.scraperName, output);
        return postArray;
    }

    /**
     * Scrapes the tweets delivered by the reader and stores them in a new post.
     *
     * @param br reader delivering the HTML of the search result page
     * @param type key handed to {@code putData} together with the scraped timeline
     * @param url the search URL the HTML was loaded from
     * @return the post; it carries no scraped data when reading the page failed
     */
    protected Post scrape(BufferedReader br, String type, String url) {
        Post typeArray = new Post(true);
        try {
            this.putData(typeArray, type, this.search(br, url));
        } catch (IOException e) {
            // best effort: the post is still returned, but the failure must not vanish silently
            DAO.severe(e);
        }
        return typeArray;
    }

    /**
     * Scrapes tweets from the HTML of a Twitter search result page.
     * <p>
     * The page is read line by line. Attribute values found on a line are collected in a
     * property map. Whenever a line containing "stream-item" is read while more than four
     * properties are known, the collected data is checked with
     * {@code filterPosts}; if it passes and all required properties are present, a
     * {@link TweetPost} is created, added to the result and the collected data is reset.
     * If required properties are still missing, parsing simply continues collecting.
     * The reader is closed before this method returns, also when reading fails.
     *
     * @param br reader delivering the HTML of the search result page
     * @param url the search URL the HTML was loaded from; handed to every created tweet
     * @return the timeline with all tweets that were parsed completely and passed the filter
     * @throws IOException if reading fails or the status URL of a tweet is malformed
     */
    private Timeline2 search(final BufferedReader br, String url) throws IOException {
        Timeline2 timelineReady = new Timeline2(order);
        Map<String, prop> props = new HashMap<String, prop>();
        // Media of the tweet currently being parsed. Fresh sets are created for every tweet
        // because a created TweetPost keeps a reference to the sets handed to it.
        Set<String> images = new HashSet<String>();
        Set<String> videos = new HashSet<String>();
        String place_id = "";
        String place_name = "";
        boolean parsing_favourite = false;
        boolean parsing_retweet = false;

        // try-with-resources closes the caller's reader on every exit path
        try (BufferedReader reader = br) {
            String input;
            while ((input = reader.readLine()) != null) {
                input = input.trim();

                if (input.length() == 0)
                    continue;

                // parse from HTML
                int p;
                if ((p = input.indexOf("=\"account-group")) > 0) {
                    props.put("userid", new prop(input, p, "data-user-id"));
                    continue;
                }
                if ((p = input.indexOf("class=\"avatar js-action-profile-avatar")) > 0) {
                    props.put("useravatarurl", new prop(input, p, "src"));
                    continue;
                }
                if ((p = input.indexOf("data-name=")) >= 0) {
                    props.put("userfullname", new prop(input, p, "data-name"));
                    // don't continue here, username is in the same line
                }
                if ((p = input.indexOf("data-screen-name=")) >= 0) {
                    props.put("usernickname", new prop(input, p, "data-screen-name"));
                    // don't continue here, fullname is in the same line
                }
                if ((p = input.indexOf("class=\"tweet-timestamp")) > 0) {
                    props.put("tweetstatusurl", new prop(input, 0, "href"));
                    props.put("tweettimename", new prop(input, p, "title"));
                    // don't continue here because "class=\"_timestamp" is in the same line
                }
                if ((p = input.indexOf("class=\"_timestamp")) > 0) {
                    props.put("tweettimems", new prop(input, p, "data-time-ms"));
                    continue;
                }
                if ((p = input.indexOf("class=\"ProfileTweet-action--retweet")) > 0) {
                    parsing_retweet = true;
                    continue;
                }
                if ((p = input.indexOf("class=\"ProfileTweet-action--favorite")) > 0) {
                    parsing_favourite = true;
                    continue;
                }
                if ((p = input.indexOf("class=\"TweetTextSize")) > 0) {
                    // read until closing p tag to account for new lines in tweets;
                    // stop at the end of the stream so that a truncated page cannot loop forever
                    String next;
                    while (input.lastIndexOf("</p>") == -1 && (next = reader.readLine()) != null) {
                        input = input + ' ' + next;
                    }
                    props.put("tweettext", new prop(input, p, null));
                    continue;
                }
                if ((p = input.indexOf("class=\"ProfileTweet-actionCount")) > 0) {
                    if (parsing_retweet) {
                        props.put("tweetretweetcount", new prop(input, p, "data-tweet-stat-count"));
                        parsing_retweet = false;
                    }
                    if (parsing_favourite) {
                        props.put("tweetfavouritecount", new prop(input, p, "data-tweet-stat-count"));
                        parsing_favourite = false;
                    }
                    continue;
                }
                // get images
                if ((p = input.indexOf("<img")) >= 0) {
                    String img_link = new prop(input, p, "src").value;
                    if (img_link != null && img_link.contains("pbs.twimg.com/media/")) {
                        images.add(img_link);
                    }
                    continue;
                }
                // we have two opportunities to get video thumbnails == more images; images in the
                // presence of video content should be treated as thumbnail for the video
                if ((p = input.indexOf("class=\"animated-gif-thumbnail\"")) > 0) {
                    String image_url = new prop(input, 0, "src").value;
                    if (image_url != null) {
                        images.add(image_url);
                    }
                    continue;
                }
                if ((p = input.indexOf("class=\"animated-gif\"")) > 0) {
                    String image_url = new prop(input, p, "poster").value;
                    if (image_url != null) {
                        images.add(image_url);
                    }
                    continue;
                }
                if ((p = input.indexOf("<source video-src")) >= 0 && input.indexOf("type=\"video/") > p) {
                    String video_url = new prop(input, p, "video-src").value;
                    if (video_url != null) {
                        videos.add(video_url);
                    }
                    continue;
                }
                // Complete videos (lines containing "AdaptiveMedia-videoContainer") are not fetched here.
                // See https://github.com/loklak/loklak_server/issues/1298
                if ((p = input.indexOf("class=\"Tweet-geo")) > 0) {
                    place_name = new prop(input, p, "title").value;
                    continue;
                }
                if ((p = input.indexOf("class=\"ProfileTweet-actionButton u-linkClean js-nav js-geo-pivot-link")) > 0) {
                    place_id = new prop(input, p, "data-place-id").value;
                    continue;
                }

                if (props.size() > 4 && input.indexOf("stream-item") > 0) {
                    if (!filterPosts(props, videos, images)) {
                        // rejected by the filter: forget everything collected for this tweet,
                        // including its media, so that nothing is attributed to the next tweet
                        props = new HashMap<String, prop>();
                        images = new HashSet<String>();
                        videos = new HashSet<String>();
                        place_id = "";
                        place_name = "";
                        continue;
                    }

                    // evaluate the result; as long as a required part is missing keep collecting
                    prop userid = props.get("userid");
                    prop usernickname = props.get("usernickname");
                    prop useravatarurl = props.get("useravatarurl");
                    prop userfullname = props.get("userfullname");
                    if (userid == null || usernickname == null || useravatarurl == null || userfullname == null)
                        continue;

                    prop tweettimems = props.get("tweettimems");
                    prop tweetretweetcount = props.get("tweetretweetcount");
                    prop tweetfavouritecount = props.get("tweetfavouritecount");
                    if (!hasValue(tweettimems) || !hasValue(tweetretweetcount) || !hasValue(tweetfavouritecount))
                        continue;

                    // these were dereferenced without a check before
                    prop tweettimename = props.get("tweettimename");
                    prop tweetstatusurl = props.get("tweetstatusurl");
                    prop tweettext = props.get("tweettext");
                    if (tweettimename == null || !hasValue(tweetstatusurl) || !hasValue(tweettext))
                        continue;

                    UserEntry user = new UserEntry(userid.value, usernickname.value, useravatarurl.value,
                            MessageEntry.html2utf8(userfullname.value));

                    TweetPost tweet = new TweetPost(user.getScreenName(), Long.parseLong(tweettimems.value),
                            tweettimename.value, tweetstatusurl.value, tweettext.value,
                            Long.parseLong(tweetretweetcount.value), Long.parseLong(tweetfavouritecount.value),
                            images, videos, place_name, place_id, user, url);

                    timelineReady.addPost(tweet);

                    // start over for the next tweet
                    images = new HashSet<String>();
                    videos = new HashSet<String>();
                    place_id = "";
                    place_name = "";
                    props.clear();
                }
            }
        }
        return timelineReady;
    }

    /** Tells whether a property was found on the page and carries a value. */
    private static boolean hasValue(prop property) {
        return property != null && property.value != null;
    }

    // youtu.be/<id> short links
    private static final Pattern youtube_short_pattern = Pattern.compile("youtu\\.be/[0-9A-Za-z_-]+");
    // youtube.com/watch?v=<id> links; the '?' is escaped so that it is matched literally
    private static final Pattern youtube_watch_pattern = Pattern
            .compile("youtube\\.com/watch\\?v=[0-9A-Za-z_-]+");

    /**
     * Filter Posts(here tweets) according to values.
     *   image: filter tweets with images, neglect 'tweets without images'
     *   video: filter tweets also having video and other values like image. For only value as video,
     *          tweets with videos are filtered in prepareSearchUrl() method
     * A tweet counts as having a video when the video set is not empty or when its text
     * contains a youtu.be or youtube.com/watch link.
     *
     * @param props properties collected for the tweet; "tweettext" is evaluated
     * @param videos video URLs collected for the tweet, may be null
     * @param images image URLs collected for the tweet, may be null
     * @return true if the tweet passes the filter (also when no filter list is set), false if it is rejected
     */
    private boolean filterPosts(Map<String, prop> props, Set<String> videos, Set<String> images) {
        if (this.filterList == null) {
            // no filter values at all: nothing can be rejected
            return true;
        }

        // Filter tweets with videos and others
        if (this.filterList.contains("video") && this.filterList.size() > 1) {
            prop textProp = props.get("tweettext");
            String text = (textProp == null || textProp.value == null) ? "" : textProp.value;

            boolean hasVideo = (videos != null && !videos.isEmpty())
                    || youtube_short_pattern.matcher(text).find()
                    || youtube_watch_pattern.matcher(text).find();
            if (!hasVideo) {
                return false;
            }
        }
        // Filter tweets with images
        if (this.filterList.contains("image") && (images == null || images.isEmpty())) {
            return false;
        }

        //TODO: Add more filters

        return true;
    }

    private static class prop {
        public String key, value = null;

        public prop(String value) {
            this.key = null;
            this.value = value;
        }

        public prop(String line, int start, String key) {
            this.key = key;
            if (key == null) {
                int p = line.indexOf('>', start);
                if (p > 0) {
                    int c = 1;
                    int q = p + 1;
                    while (c > 0 && q < line.length()) {
                        char a = line.charAt(q);
                        if (a == '<') {
                            if (line.charAt(q + 1) != 'i') {
                                if (line.charAt(q + 1) == '/')
                                    c--;
                                else
                                    c++;
                            }
                        }
                        q++;
                    }
                    assert p >= -1;
                    assert q > 0;
                    try {
                        value = line.substring(p + 1, q - 1);
                    } catch (StringIndexOutOfBoundsException e) {
                        DAO.debug(e);
                    }
                }
            } else {
                int p = line.indexOf(key + "=\"", start);
                if (p >= 0) {
                    int q = line.indexOf('"', p + key.length() + 2);
                    if (q > 0) {
                        value = line.substring(p + key.length() + 2, q);
                    }
                }
            }
        }

        @SuppressWarnings("unused")
        public boolean success() {
            return value != null;
        }

        public String toString() {
            return this.key + "=" + (this.value == null ? "unknown" : this.value);
        }
    }

    /**
     * A tweet, either created from scraped values or read back from its JSON form.
     * {@link #toJson()} writes the fields of the tweet into the underlying post.
     */
    public static class TweetPost extends Post {

        public UserEntry user;
        // a time stamp that is given in loklak upon the arrival of the tweet which is the current local time
        public Date timestampDate;
        // the time given in the tweet which is the time when the user created it.
        // This is also use to do the index partition into minute, hour, week
        public Date created_at;
        // where did the message come from
        protected SourceType source_type;
        // who created the message
        protected ProviderType provider_type;
        // the search URL the tweet was scraped from; written as "search_url" by toJson()
        private String searchUrl;
        public String screen_name, retweet_from, postId, canonical_id, parent, text;
        protected URL status_id_url;
        protected long retweet_count, favourites_count;
        public Set<String> images, audios, videos;
        protected String place_name, place_id;
        private boolean enriched;

        /**
         * Creates a tweet from scraped values and writes it to its JSON form right away.
         *
         * @param user_screen_name_raw screen name of the author
         * @param created_at_raw creation time of the tweet in milliseconds
         * @param created_at_name_raw not used here but should be compared to created_at_raw
         * @param status_id_url_raw path of the tweet, appended to "https://twitter.com";
         *                          the part behind the last '/' becomes the post id
         * @param text_raw scraped text of the tweet
         * @param retweets number of retweets
         * @param favourites number of favourites
         * @param images image URLs of the tweet
         * @param videos video URLs of the tweet
         * @param place_name name of the place attached to the tweet
         * @param place_id id of the place attached to the tweet
         * @param user the author
         * @param url the search URL the tweet was scraped from
         * @throws MalformedURLException if the status URL cannot be parsed
         */
        public TweetPost(final String user_screen_name_raw, final long created_at_raw,
                final String created_at_name_raw, final String status_id_url_raw, final String text_raw,
                final long retweets, final long favourites, final Set<String> images, final Set<String> videos,
                final String place_name, final String place_id, final UserEntry user, String url)
                throws MalformedURLException {
            super();
            this.source_type = SourceType.TWITTER;
            this.provider_type = ProviderType.SCRAPED;
            this.screen_name = user_screen_name_raw;
            this.created_at = new Date(created_at_raw);
            this.status_id_url = new URL("https://twitter.com" + status_id_url_raw);
            int p = status_id_url_raw.lastIndexOf('/');
            this.postId = p >= 0 ? status_id_url_raw.substring(p + 1) : "-1";
            this.retweet_count = retweets;
            this.favourites_count = favourites;
            this.images = images;
            this.videos = videos;
            // the place was accepted but never stored, so toJson() always wrote null for it
            this.place_name = place_name;
            this.place_id = place_id;
            this.text = text_raw;
            this.user = user;
            this.searchUrl = url;

            // Set to json
            this.toJson();
        }

        /**
         * Reads a tweet from its JSON form. Missing keys leave the corresponding field null;
         * an unknown or missing source type becomes GENERIC, an unknown or missing provider
         * type becomes NOONE, and an unparsable link becomes null.
         *
         * @param json the JSON form of the tweet
         * @param enrich not evaluated; the tweet is always marked as not enriched
         */
        public TweetPost(JSONObject json, boolean enrich) {
            Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME);
            this.timestampDate = MessageEntry.parseDate(timestamp_obj);
            this.timestamp = this.timestampDate.getTime();
            Object created_at_obj = lazyGet(json, AbstractObjectEntry.CREATED_AT_FIELDNAME);
            this.created_at = MessageEntry.parseDate(created_at_obj);
            String source_type_string = (String) lazyGet(json, "source_type");
            try {
                this.source_type = source_type_string == null ? SourceType.GENERIC
                        : SourceType.byName(source_type_string);
            } catch (IllegalArgumentException e) {
                this.source_type = SourceType.GENERIC;
            }
            String provider_type_string = (String) lazyGet(json, "provider_type");
            if (provider_type_string == null)
                provider_type_string = ProviderType.NOONE.name();
            try {
                this.provider_type = ProviderType.valueOf(provider_type_string);
            } catch (IllegalArgumentException e) {
                this.provider_type = ProviderType.NOONE;
            }
            this.screen_name = (String) lazyGet(json, "screen_name");
            this.retweet_from = (String) lazyGet(json, "retweet_from");
            this.postId = (String) lazyGet(json, "id_str");
            this.text = (String) lazyGet(json, "text");
            try {
                this.status_id_url = new URL((String) lazyGet(json, "link"));
            } catch (MalformedURLException e) {
                this.status_id_url = null;
            }
            this.retweet_count = MessageEntry.parseLong((Number) lazyGet(json, "retweet_count"));
            this.favourites_count = MessageEntry.parseLong((Number) lazyGet(json, "favourites_count"));
            this.images = MessageEntry.parseArrayList(lazyGet(json, "images"));
            this.audios = MessageEntry.parseArrayList(lazyGet(json, "audio"));
            this.videos = MessageEntry.parseArrayList(lazyGet(json, "videos"));
            this.enriched = false;
        }

        /**
         * Creates an empty tweet: texts are empty strings, counters are zero, media sets are
         * empty, both dates are the current time and there is no status URL and no user.
         *
         * @throws MalformedURLException declared for callers, not thrown by this constructor
         */
        public TweetPost() throws MalformedURLException {
            this.timestamp = new Date().getTime();
            this.timestampDate = new Date(this.timestamp);
            this.created_at = new Date();
            this.source_type = SourceType.GENERIC;
            this.provider_type = ProviderType.NOONE;
            this.screen_name = "";
            this.retweet_from = "";
            this.postId = "";
            this.canonical_id = "";
            this.parent = "";
            this.text = "";
            this.status_id_url = null;
            this.retweet_count = 0;
            this.favourites_count = 0;
            this.images = new HashSet<String>();
            this.audios = new HashSet<String>();
            this.videos = new HashSet<String>();
            this.place_id = "";
            this.place_name = "";
            this.enriched = false;
        }

        /**
         * Converts text and user name with {@code unshorten} and writes the fields of this
         * tweet into the post. Tweets without user, text or media sets (as created by the
         * JSON and the no-argument constructor) are written without failing: a missing user
         * is left out and missing media sets are written as empty.
         *
         * @return this post
         */
        public Post toJson() {

            if (this.text != null) {
                this.text = unshorten(this.text);
            }
            if (this.user != null && this.user.getName() != null) {
                this.user.setName(unshorten(this.user.getName()));
            }
            Set<String> imageSet = this.images == null ? new HashSet<String>() : this.images;
            Set<String> videoSet = this.videos == null ? new HashSet<String>() : this.videos;

            // tweet data
            this.put("timestamp", AbstractObjectEntry.utcFormatter.print(getTimestampDate().getTime()));
            this.put("created_at", AbstractObjectEntry.utcFormatter.print(getCreatedAt().getTime()));
            this.put("text", this.text);
            if (this.status_id_url == null) {
                this.put("link", "");
            } else {
                this.put("link", this.status_id_url.toExternalForm());
            }
            this.put("images", imageSet);
            this.put("images_count", imageSet.size());
            this.put("videos", videoSet);
            this.put("videos_count", videoSet.size());
            this.put("id_str", this.postId);

            this.put("retweet_count", this.retweet_count);
            this.put("favourites_count", this.favourites_count);

            // Add places
            this.put("place_name", this.place_name);
            this.put("place_id", this.place_id);
            this.put("search_url", this.searchUrl);
            // Add user
            if (this.user != null) {
                this.put("user", this.user.toJSON());
            }
            return this;
        }

        /**
         * Reads a value from a JSON object without failing on a missing key.
         *
         * @param json the object to read from
         * @param key the key to look up
         * @return the value, or null if {@code json.get} throws a {@code JSONException}
         */
        public Object lazyGet(JSONObject json, String key) {
            try {
                Object o = json.get(key);
                return o;
            } catch (JSONException e) {
                return null;
            }
        }

        /** @return the author of the tweet, may be null */
        public UserEntry getUser() {
            return this.user;
        }

        /** @return the arrival time of the tweet, or the current time if none is set */
        public Date getTimestampDate() {
            return this.timestampDate == null ? new Date() : this.timestampDate;
        }

        /** @return the creation time of the tweet, or the current time if none is set */
        public Date getCreatedAt() {
            return this.created_at == null ? new Date() : this.created_at;
        }

        public void setCreatedAt(Date created_at) {
            this.created_at = created_at;
        }

        public String getScreenName() {
            return screen_name;
        }

        public void setScreenName(String user_screen_name) {
            this.screen_name = user_screen_name;
        }

        public URL getStatusIdUrl() {
            return this.status_id_url;
        }

        public void setStatusIdUrl(URL status_id_url) {
            this.status_id_url = status_id_url;
        }

        public long getRetweetCount() {
            return retweet_count;
        }

        public void setRetweetCount(long retweet_count) {
            this.retweet_count = retweet_count;
        }

        public long getFavouritesCount() {
            return this.favourites_count;
        }

        public void setFavouritesCount(long favourites_count) {
            this.favourites_count = favourites_count;
        }

        //TODO: to implement this method
        private void setPostId() {
            this.postId = String.valueOf(this.timestamp) + String.valueOf(this.created_at.getTime());
        }

        public String getText() {
            return this.text;
        }

        public void setText(String text) {
            this.text = text;
        }

        /** @return the length of the text, 0 if there is no text */
        public int getTextLength() {
            return this.text == null ? 0 : this.text.length();
        }

        /**
         * @return the post id as a number
         * @throws NumberFormatException if the post id is not a decimal number
         */
        public long getId() {
            return Long.parseLong(this.postId);
        }
    }

    /**
     * Converts the scraped HTML of a text into plain text.
     * <p>
     * One match at a time, emoji elements are replaced by their textual form, hashtag
     * anchors by " #tag ", timeline links by their unshortened expanded URL, and embedded
     * pic.twitter.com links are removed. When nothing matches any more (or a replacement
     * step fails), remaining markup covered by the cleanup pattern is removed, HTML
     * entities are decoded, double spaces are collapsed and the text is trimmed.
     *
     * @param text the scraped HTML, may be null
     * @return the plain text, or null if the given text was null
     */
    public static String unshorten(String text) {
        if (text == null) {
            return null;
        }
        while (true) {
            try {
                // Captured text is inserted with quoteReplacement: '$' and '\' in a URL or an
                // emoji text would otherwise be interpreted as group references or escapes.
                Matcher m = emoji_pattern.matcher(text);
                if (m.find()) {
                    text = m.replaceFirst(Matcher.quoteReplacement(m.group(1)));
                    continue;
                }
                m = emoji_pattern_span.matcher(text);
                if (m.find()) {
                    text = m.replaceFirst(Matcher.quoteReplacement(m.group(1)));
                    continue;
                }
                m = hashtag_pattern.matcher(text);
                if (m.find()) {
                    // the extra spaces are needed because twitter removes them if the hashtag is followed with a link
                    text = m.replaceFirst(Matcher.quoteReplacement(" #" + m.group(1) + " "));
                    continue;
                }
                m = timeline_link_pattern.matcher(text);
                if (m.find()) {
                    String expanded = RedirectUnshortener.unShorten(m.group(2));
                    text = m.replaceFirst(Matcher.quoteReplacement(" " + expanded));
                    continue;
                }
                m = timeline_embed_pattern.matcher(text);
                if (m.find()) {
                    text = m.replaceFirst("");
                    continue;
                }
            } catch (Throwable e) {
                // kept as broad as before: whatever fails, the text converted so far is still cleaned up
                DAO.severe(e);
            }
            // nothing matched (or a step failed): the rewriting is finished.
            // Without this break the loop never terminated once all patterns were replaced.
            break;
        }
        text = cleanup_pattern.matcher(text).replaceAll("");
        text = MessageEntry.html2utf8(text);
        text = doublespace_pattern.matcher(text).replaceAll(" ");
        text = text.trim();
        return text;
    }
}