org.loklak.android.data.MessageEntry.java Source code

Java tutorial

Introduction

Here is the source code for org.loklak.android.data.MessageEntry.java

Source

/**
 *  MessageEntry
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *  This class is the android version from the original file,
 *  taken from the loklak_server project. It may be slightly different.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.android.data;

import org.json.JSONException;
import org.json.JSONObject;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MessageEntry extends AbstractIndexEntry {

    // separator string constant; it is not referenced anywhere else in this file
    public static final String RICH_TEXT_SEPARATOR = "\n***\n";

    protected Date created_at, on, to; // created_at will always be set; 'on' means 'valid from' and 'to' means 'valid until', both may be unset (null)
    protected String provider_hash, screen_name, retweet_from, id_str, canonical_id, parent, text;
    protected URL status_id_url; // written to the JSON field "link"
    protected long retweet_count, favourites_count;
    protected LinkedHashSet<String> images, audio, videos; // media links; enrich() adds links it recognizes as media
    protected String place_name, place_id;

    // the following fields are either set as a common field or generated by extraction from field 'text' or from field 'place_name'
    protected double[] location_point, location_mark; // coordinate order is [longitude, latitude]
    protected int location_radius; // meter
    protected String place_country; // the constructors only keep values of exactly two characters

    // the following can be computed from the tweet data but is stored in the search index to provide statistical data and ranking attributes
    private int without_l_len, without_lu_len, without_luh_len; // the length of tweets without links, users, hashtags
    private String[] hosts, links, mentions, hashtags; // the arrays of links, users, hashtags; filled by enrich()

    /**
     * Creates an empty message: the creation date is "now", all text fields are
     * empty strings, all counters are zero, the media sets and the derived arrays
     * are empty, and the validity dates, the status URL and the location are unset.
     *
     * @throws MalformedURLException declared for callers; nothing in this body throws it
     */
    public MessageEntry() throws MalformedURLException {
        // dates: only the creation date has a value
        this.created_at = new Date();
        this.on = this.to = null;

        // identity and text fields all start out empty
        this.provider_hash = this.screen_name = this.retweet_from = "";
        this.id_str = this.canonical_id = this.parent = this.text = "";
        this.status_id_url = null;

        // counters
        this.retweet_count = this.favourites_count = 0;

        // media sets, one independent instance each
        this.images = new LinkedHashSet<String>();
        this.audio = new LinkedHashSet<String>();
        this.videos = new LinkedHashSet<String>();

        // place and location
        this.place_id = this.place_name = "";
        this.place_country = null;
        this.location_point = this.location_mark = null;
        this.location_radius = 0;

        // derived statistics, normally produced by enrich()
        this.without_l_len = this.without_lu_len = this.without_luh_len = 0;
        this.hosts = new String[0];
        this.links = new String[0];
        this.mentions = new String[0];
        this.hashtags = new String[0];
    }

    /**
     * Read-only key lookup that lets the JSON constructor and the Map constructor
     * share a single field-loading routine.
     */
    private interface FieldSource {
        /**
         * @param key the field name
         * @return the raw value the underlying container delivers for {@code key}
         */
        Object get(String key);
    }

    /**
     * Creates a message entry from a JSON object. Values are read with
     * {@code lazyGet}; afterwards {@link #enrich()} computes the derived fields.
     *
     * @param json the JSON representation of the message
     */
    public MessageEntry(final JSONObject json) {
        initFrom(new FieldSource() {
            @Override
            public Object get(String key) {
                return lazyGet(json, key);
            }
        });
    }

    /**
     * Creates a message entry from a key/value map that uses the same field names
     * as the JSON representation; afterwards {@link #enrich()} computes the
     * derived fields.
     *
     * @param map the field values keyed by field name
     */
    public MessageEntry(final Map<String, Object> map) {
        initFrom(new FieldSource() {
            @Override
            public Object get(String key) {
                return map.get(key);
            }
        });
    }

    /**
     * Loads all persistent fields from {@code source} and then calls {@link #enrich()}.
     * The parse helpers used here (parseDate, parseLong, parseArrayList, parseString)
     * are not defined in this file - presumably inherited from AbstractIndexEntry.
     *
     * @param source lookup for the raw field values
     */
    private void initFrom(FieldSource source) {
        this.created_at = parseDate(source.get("created_at"));
        // 'on' and 'to' are optional; parse the value that was read, not the (still unset) field
        Object on_obj = source.get("on");
        this.on = on_obj == null ? null : parseDate(on_obj);
        Object to_obj = source.get("to");
        this.to = to_obj == null ? null : parseDate(to_obj);

        this.provider_hash = (String) source.get("provider_hash");
        this.screen_name = (String) source.get("screen_name");
        this.retweet_from = (String) source.get("retweet_from");
        this.id_str = (String) source.get("id_str");
        this.text = (String) source.get("text");

        // a missing or malformed link leaves the status URL unset
        String link = (String) source.get("link");
        try {
            this.status_id_url = link == null ? null : new URL(link);
        } catch (MalformedURLException e) {
            this.status_id_url = null;
        }

        this.retweet_count = parseLong((Number) source.get("retweet_count"));
        this.favourites_count = parseLong((Number) source.get("favourites_count"));
        this.images = parseArrayList(source.get("images"));
        this.audio = parseArrayList(source.get("audio"));
        this.videos = parseArrayList(source.get("videos"));
        this.place_id = parseString((String) source.get("place_id"));
        this.place_name = parseString((String) source.get("place_name"));
        this.place_country = parseString((String) source.get("place_country"));
        // only values of exactly two characters are kept as country
        if (this.place_country != null && this.place_country.length() != 2) {
            this.place_country = null;
        }

        // optional location: point and mark must both be usable, otherwise none of it is kept
        double[] point = toLonLat(source.get("location_point"));
        double[] mark = toLonLat(source.get("location_mark"));
        if (point == null || mark == null) {
            this.location_point = null;
            this.location_radius = 0;
            this.location_mark = null;
        } else {
            this.location_point = point;
            this.location_radius = (int) parseLong((Number) source.get("location_radius"));
            this.location_mark = mark;
        }

        // load enriched data
        enrich();
    }

    /**
     * Converts a raw coordinate value into a two-element array built from the
     * first two list elements (the class stores coordinates as [longitude, latitude]).
     * Any {@link Number} is accepted, so integer coordinates work as well.
     *
     * @param raw the raw value; expected to be a list with at least two numbers
     * @return the two leading values as doubles, or {@code null} if {@code raw}
     *         is not such a list
     */
    private static double[] toLonLat(Object raw) {
        if (!(raw instanceof List<?>)) {
            return null;
        }
        List<?> values = (List<?>) raw;
        if (values.size() < 2 || !(values.get(0) instanceof Number) || !(values.get(1) instanceof Number)) {
            return null;
        }
        return new double[] { ((Number) values.get(0)).doubleValue(), ((Number) values.get(1)).doubleValue() };
    }

    /**
     * @return the creation date, or a new {@link Date} holding the current time
     *         if no creation date is set
     */
    public Date getCreatedAt() {
        if (this.created_at != null) {
            return this.created_at;
        }
        return new Date();
    }

    /**
     * Sets the creation date.
     *
     * @param created_at the new creation date; stored as given, without a null check
     */
    public void setCreatedAt(Date created_at) {
        this.created_at = created_at;
    }

    /**
     * @return the 'valid from' date, or {@code null} if it is not set
     */
    public Date getOn() {
        return on;
    }

    /**
     * Sets the 'valid from' date.
     *
     * @param on the new 'valid from' date; may be {@code null} to leave it unset
     */
    public void setOn(Date on) {
        this.on = on;
    }

    /**
     * @return the 'valid until' date, or {@code null} if it is not set
     */
    public Date getTo() {
        return to;
    }

    /**
     * Sets the 'valid until' date.
     *
     * @param to the new 'valid until' date; may be {@code null} to leave it unset
     */
    public void setTo(Date to) {
        this.to = to;
    }

    /**
     * @return the provider hash as currently stored
     */
    public String getProviderHash() {
        return this.provider_hash;
    }

    /**
     * Sets the provider hash.
     *
     * @param provider_hash the new provider hash; stored as given
     */
    public void setProviderHash(String provider_hash) {
        this.provider_hash = provider_hash;
    }

    /**
     * @return the screen name of the message author as currently stored
     */
    public String getScreenName() {
        return this.screen_name;
    }

    /**
     * Sets the screen name.
     *
     * @param user_screen_name the new screen name; stored as given
     */
    public void setScreenName(String user_screen_name) {
        this.screen_name = user_screen_name;
    }

    /**
     * @return the stored 'retweet_from' value
     */
    public String getRetweetFrom() {
        return retweet_from;
    }

    /**
     * Sets the 'retweet_from' value.
     *
     * @param retweet_from the new value; stored as given
     */
    public void setRetweetFrom(String retweet_from) {
        this.retweet_from = retweet_from;
    }

    /**
     * @return the message id in its string form
     */
    public String getIdStr() {
        return this.id_str;
    }

    /**
     * Sets the message id in its string form.
     *
     * @param id_str the new id string; stored as given. {@code getId()} parses
     *               this value with {@code Long.parseLong}.
     */
    public void setIdStr(String id_str) {
        this.id_str = id_str;
    }

    /**
     * @return the status URL (serialized as "link"), or {@code null} if it is not set
     */
    public URL getStatusIdUrl() {
        return status_id_url;
    }

    /**
     * Sets the status URL.
     *
     * @param status_id_url the new status URL; may be {@code null}
     */
    public void setStatusIdUrl(URL status_id_url) {
        this.status_id_url = status_id_url;
    }

    /**
     * @return the stored retweet count
     */
    public long getRetweetCount() {
        return this.retweet_count;
    }

    /**
     * Sets the retweet count.
     *
     * @param retweet_count the new retweet count; stored as given
     */
    public void setRetweetCount(long retweet_count) {
        this.retweet_count = retweet_count;
    }

    /**
     * @return the stored favourites count
     */
    public long getFavouritesCount() {
        return favourites_count;
    }

    /**
     * Sets the favourites count.
     *
     * @param favourites_count the new favourites count; stored as given
     */
    public void setFavouritesCount(long favourites_count) {
        this.favourites_count = favourites_count;
    }

    /**
     * @return the stored place name
     */
    public String getPlaceName() {
        return this.place_name;
    }

    /**
     * @return the stored place id
     */
    public String getPlaceId() {
        return this.place_id;
    }

    /**
     * Sets the place id.
     *
     * @param place_id the new place id; stored as given
     */
    public void setPlaceId(String place_id) {
        this.place_id = place_id;
    }

    /**
     * Returns the location point. The internal array is returned directly, not a copy.
     *
     * @return [longitude, latitude], or {@code null} if no location is set
     */
    public double[] getLocationPoint() {
        return this.location_point;
    }

    /**
     * Sets the location point. The array is stored as given, not copied.
     *
     * @param location_point in the form [longitude, latitude]
     */
    public void setLocationPoint(double[] location_point) {
        this.location_point = location_point;
    }

    /**
     * Returns the location mark. The internal array is returned directly, not a copy.
     *
     * @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint(),
     *         or {@code null} if no location is set
     */
    public double[] getLocationMark() {
        return this.location_mark;
    }

    /**
     * Sets the location mark. The array is stored as given, not copied.
     *
     * @param location_mark in the form [longitude, latitude]
     */
    public void setLocationMark(double[] location_mark) {
        this.location_mark = location_mark;
    }

    /**
     * Returns the location radius.
     *
     * @return radius in meter around getLocationPoint() (NOT getLocationMark())
     */
    public int getLocationRadius() {
        return this.location_radius;
    }

    /**
     * Sets the location radius.
     *
     * @param location_radius the radius in meter; stored as given
     */
    public void setLocationRadius(int location_radius) {
        this.location_radius = location_radius;
    }

    /**
     * Replaces the message text. Only the text field is assigned here; the
     * derived fields (links, mentions, hashtags, hosts, length counters) are
     * not recomputed until enrich() is called.
     *
     * @param text the new message text; stored as given
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * Replaces the image set with the result of parseArrayList(images).
     * NOTE(review): parseArrayList is not defined in this file - presumably an
     * inherited helper that converts the list into a set; confirm.
     *
     * @param images the image links as a list
     */
    public void setImages(ArrayList<String> images) {
        this.images = parseArrayList(images);
    }

    /**
     * Replaces the image set with the result of parseArrayList(images).
     * NOTE(review): parseArrayList is not defined in this file - presumably an
     * inherited helper that also accepts arrays; confirm.
     *
     * @param images the image links as an array
     */
    public void setImages(String[] images) {
        this.images = parseArrayList(images);
    }

    /**
     * Replaces the image set with the result of parseArrayList(image).
     * NOTE(review): parseArrayList is not defined in this file - presumably an
     * inherited helper that also accepts a single string; confirm.
     *
     * @param image a single image link
     */
    public void setImages(String image) {
        this.images = parseArrayList(image);
    }

    /**
     * Returns the message id as a number by parsing the id string.
     *
     * @return the numeric value of the id string
     * @throws NumberFormatException if the id string is not a parsable long
     *                               (this includes the empty string and null)
     */
    public long getId() {
        final String raw = this.id_str;
        return Long.parseLong(raw);
    }

    /**
     * @return the host names computed by enrich(); the internal array is
     *         returned directly, not a copy
     */
    public String[] getHosts() {
        return hosts;
    }

    /**
     * Returns the message text unchanged. Both parameters are accepted but not
     * used by this implementation.
     *
     * @param iflinkexceedslength unused here
     * @param urlstub unused here
     * @return the stored message text
     */
    public String getText(final int iflinkexceedslength, final String urlstub) {
        return text;
    }

    /**
     * @return the mentioned user names computed by enrich() (without the leading '@');
     *         the internal array is returned directly, not a copy
     */
    public String[] getMentions() {
        return mentions;
    }

    /**
     * @return the lower-cased hashtags computed by enrich() (without the leading '#');
     *         the internal array is returned directly, not a copy
     */
    public String[] getHashtags() {
        return hashtags;
    }

    /**
     * @return the links computed by enrich(); the internal array is returned
     *         directly, not a copy
     */
    public String[] getLinks() {
        return links;
    }

    /**
     * @return the image links; the internal set is returned directly, not a copy
     */
    public Collection<String> getImages() {
        return images;
    }

    // two or more consecutive spaces
    static final Pattern SPACEX_PATTERN = Pattern.compile("  +");
    // links; the right boundary must be a space (or ')' or the end) since other characters may appear in urls
    static final Pattern URL_PATTERN = Pattern.compile("(?:\\b|^)(https?://.*?)(?:[) ]|$)");
    // user mentions; the left boundary must be a space (or '(' or the start) since the @ is itself a boundary
    static final Pattern USER_PATTERN = Pattern.compile("(?:[ (]|^)(@..*?)(?:\\b|$)");
    // hashtags; the left boundary must be a space (or '(' or the start) since the # is itself a boundary
    static final Pattern HASHTAG_PATTERN = Pattern.compile("(?:[ (]|^)(#..*?)(?:\\b|$)");

    /**
     * Creates enriched data, useful for analytics and ranking:
     * - identifies all links, mentioned users and hashtags in the text
     * - counts the message size without links
     * - counts the message size without links and without users
     * - counts the message size without links, users and hashtags
     * - collects the (lower-cased, de-duplicated) hosts of the links
     * - adds links that look like video, audio or image resources to the media sets
     *
     * A missing text is treated as empty. Hosts and hashtags are lower-cased with
     * {@link Locale#ROOT} so the result does not depend on the default locale.
     */
    public void enrich() {
        // the JSON/Map constructors store whatever "text" holds, which may be null
        StringBuilder t = new StringBuilder(this.text == null ? "" : this.text);

        // extract the links
        List<String> links = extract(t, URL_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_l_len = t.length(); // len_no_l

        // extract the users
        List<String> users = extract(t, USER_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_lu_len = t.length(); // len_no_l_and_users

        // extract the hashtags
        List<String> hashtags = extract(t, HASHTAG_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_luh_len = t.length(); // len_no_l_and_users_and_hashtags

        // extract the hosts from the links; lower-case before adding so that
        // hosts differing only in case collapse into one entry
        Set<String> hosts = new LinkedHashSet<String>();
        for (String u : links) {
            try {
                hosts.add(new URL(u).getHost().toLowerCase(Locale.ROOT));
            } catch (MalformedURLException ignored) {
                // best effort: a link that is not a valid URL contributes no host
            }
        }
        this.hosts = hosts.toArray(new String[hosts.size()]);

        // mentions are stored without the leading '@'
        this.mentions = new String[users.size()];
        for (int i = 0; i < users.size(); i++) {
            this.mentions[i] = users.get(i).substring(1);
        }

        // hashtags are stored lower-cased and without the leading '#'
        this.hashtags = new String[hashtags.size()];
        for (int i = 0; i < hashtags.size(); i++) {
            this.hashtags[i] = hashtags.get(i).substring(1).toLowerCase(Locale.ROOT);
        }

        this.links = links.toArray(new String[links.size()]);

        // more media data, analyze the links; the first matching category wins
        for (String link : this.links) {
            if (link.endsWith(".mp4") || link.endsWith(".m4v") || link.indexOf("vimeo.com") > 0
                    || link.indexOf("youtube.com") > 0 || link.indexOf("youtu.be") > 0
                    || link.indexOf("vine.co") > 0 || link.indexOf("ted.com") > 0) {
                this.videos.add(link);
            } else if (link.endsWith(".mp3") || link.indexOf("soundcloud.com") > 0) {
                this.audio.add(link);
            } else if (link.indexOf("flickr.com") > 0 || link.indexOf("instagram.com") > 0
                    || link.indexOf("imgur.com") > 0 || link.indexOf("giphy.com") > 0) {
                this.images.add(link);
            }
        }
    }

    /**
     * Finds all matches of {@code p} in {@code s}, collects the text of capture
     * group {@code g} of every match and removes exactly those matched spans
     * from {@code s}.
     *
     * The spans are removed by their match positions (not by searching for the
     * matched text again), so an identical character sequence that occurs
     * elsewhere in {@code s} without being a match is left untouched.
     *
     * @param s the text to scan; the matched group spans are deleted from it
     * @param p the pattern to search for
     * @param g the capture group whose text is collected and removed
     * @return the collected group texts in order of appearance
     */
    private static List<String> extract(StringBuilder s, Pattern p, int g) {
        Matcher m = p.matcher(s.toString());
        List<String> found = new ArrayList<String>();
        List<int[]> spans = new ArrayList<int[]>();
        while (m.find()) {
            String token = m.group(g);
            if (token == null) {
                continue; // the group did not take part in this match
            }
            found.add(token);
            spans.add(new int[] { m.start(g), m.end(g) });
        }
        // delete back to front so the recorded positions of earlier spans stay valid
        for (int i = spans.size() - 1; i >= 0; i--) {
            int[] span = spans.get(i);
            s.delete(span[0], span[1]);
        }
        return found;
    }

    /**
     * Serializes this message into a JSON object.
     *
     * NOTE(review): sets and arrays are handed to {@code put} as they are;
     * whether they end up as JSON arrays depends on the org.json implementation
     * in use - confirm on the target platform.
     *
     * @param user if not null, its JSON form is added under "user"
     * @param calculatedData if true, the country code, the location and the
     *                       derived hosts/links/mentions/hashtags are added as well
     * @param iflinkexceedslength passed through to getText
     * @param urlstub passed through to getText
     * @return the JSON representation of this message
     * @throws JSONException if a value cannot be stored in the JSON object
     */
    public JSONObject toJSON(final UserEntry user, final boolean calculatedData, final int iflinkexceedslength,
            final String urlstub) throws JSONException {
        final JSONObject result = new JSONObject();

        // dates; the two validity dates are optional
        result.put("created_at", utcFormatter.print(getCreatedAt().getTime()));
        if (this.on != null) {
            result.put("on", utcFormatter.print(this.on.getTime()));
        }
        if (this.to != null) {
            result.put("to", utcFormatter.print(this.to.getTime()));
        }

        // author and text
        result.put("screen_name", this.screen_name);
        if (!(this.retweet_from == null || this.retweet_from.length() == 0)) {
            result.put("retweet_from", this.retweet_from);
        }
        result.put("text", this.getText(iflinkexceedslength, urlstub));

        // identity
        if (this.status_id_url != null) {
            result.put("link", this.status_id_url.toExternalForm());
        }
        result.put("id_str", this.id_str);
        if (this.canonical_id != null) {
            result.put("canonical_id", this.canonical_id);
        }
        if (this.parent != null) {
            result.put("parent", this.parent);
        }
        if (!(this.provider_hash == null || this.provider_hash.length() == 0)) {
            result.put("provider_hash", this.provider_hash);
        }

        // counters; "favourites_count" keeps the plural naming of the twitter api
        result.put("retweet_count", this.retweet_count);
        result.put("favourites_count", this.favourites_count);

        // media, each together with its size
        result.put("images", this.images);
        result.put("images_count", this.images.size());
        result.put("audio", this.audio);
        result.put("audio_count", this.audio.size());
        result.put("videos", this.videos);
        result.put("videos_count", this.videos.size());

        // place
        result.put("place_name", this.place_name);
        result.put("place_id", this.place_id);

        // statistic/calculated data only on request
        if (calculatedData) {
            putCalculatedData(result);
        }

        // the user comes last
        if (user != null) {
            result.put("user", user.toJSON());
        }
        return result;
    }

    /**
     * Adds the country code, the optional location and the derived
     * hosts/links/mentions/hashtags (each with its count) to {@code target}.
     */
    private void putCalculatedData(final JSONObject target) throws JSONException {
        // the country code is only written when it has exactly two characters
        final boolean hasCountry = this.place_country != null && this.place_country.length() == 2;
        if (hasCountry) {
            target.put("place_country_code", this.place_country);
        }

        // optional location data; written only if point and mark both have two coordinates
        // reference for this format: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-geo-point-type.html#_lat_lon_as_array_5
        final boolean hasPoint = this.location_point != null && this.location_point.length == 2;
        final boolean hasMark = this.location_mark != null && this.location_mark.length == 2;
        if (hasPoint && hasMark) {
            target.put("location_point", this.location_point); // [longitude, latitude]
            target.put("location_radius", this.location_radius);
            target.put("location_mark", this.location_mark);
        }

        // redundant data for enhanced navigation with aggregations
        target.put("hosts", this.hosts);
        target.put("hosts_count", this.hosts.length);
        target.put("links", this.links);
        target.put("links_count", this.links.length);
        target.put("mentions", this.mentions);
        target.put("mentions_count", this.mentions.length);
        target.put("hashtags", this.hashtags);
        target.put("hashtags_count", this.hashtags.length);
    }

    /**
     * Cleans scraped text:
     * <ol>
     * <li>decodes numeric character references ({@code &#65;} and {@code &#x41;});
     *     line feed and carriage return references become a newline</li>
     * <li>decodes {@code \}{@code uXXXX} escapes (four hex digits); control characters
     *     produced this way become a space</li>
     * <li>removes {@code </a>} and decodes {@code &quot;} and {@code &amp;}</li>
     * <li>turns U+2028, CR and LF into a newline and all other control characters into a space</li>
     * <li>collapses every run of two or more spaces into a single space</li>
     * </ol>
     * Malformed references and escapes are left in the text as they are.
     *
     * @param s the text to clean
     * @return the cleaned text; {@code null} or an empty string is returned unchanged
     */
    public static String html2utf8(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        s = decodeNumericEntities(s);
        s = decodeUnicodeEscapes(s);
        // remove tags
        s = s.replace("</a>", "").replace("&quot;", "\"").replace("&amp;", "&");
        // remove funny symbols
        StringBuilder clean = new StringBuilder(s.length() + 5);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (((int) c) == 8232 || c == '\n' || c == '\r') {
                clean.append('\n'); // 8232 is U+2028, the unicode line separator
            } else if (c < ' ') {
                clean.append(' ');
            } else {
                clean.append(c);
            }
        }
        // remove runs of spaces, whatever their length
        return clean.toString().replaceAll(" {2,}", " ");
    }

    /** Replaces every well-formed {@code &#...;} reference in {@code s} by the character it denotes. */
    private static String decodeNumericEntities(String s) {
        int p = s.indexOf("&#");
        if (p < 0) {
            return s;
        }
        StringBuilder out = new StringBuilder(s.length());
        int from = 0; // start of the part of s that was not copied to out yet
        while (p >= 0) {
            int q = s.indexOf(';', p + 2);
            if (q < 0) {
                break; // no terminator anywhere behind p: nothing left to decode
            }
            int codePoint = parseCharCode(s.substring(p + 2, q));
            if (codePoint < 0) {
                // not a numeric reference: keep the "&#" and go on searching behind it
                out.append(s, from, p + 2);
                from = p + 2;
            } else {
                out.append(s, from, p);
                if (codePoint == 10 || codePoint == 13) {
                    out.append('\n');
                } else {
                    out.appendCodePoint(codePoint); // also correct beyond U+FFFF
                }
                from = q + 1;
            }
            p = s.indexOf("&#", from);
        }
        return out.append(s, from, s.length()).toString();
    }

    /**
     * Parses the inner part of a numeric character reference: decimal digits, or
     * 'x'/'X' followed by hex digits.
     *
     * @return the code point, or -1 if the text is not a valid code point number
     */
    private static int parseCharCode(String charcode) {
        if (charcode.length() == 0) {
            return -1;
        }
        try {
            char first = charcode.charAt(0);
            int codePoint = (first == 'x' || first == 'X') ? Integer.parseInt(charcode.substring(1), 16)
                    : Integer.parseInt(charcode);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /** Replaces every backslash-u escape with four hex digits in {@code s} by its character. */
    private static String decodeUnicodeEscapes(String s) {
        int p = s.indexOf("\\u");
        if (p < 0) {
            return s;
        }
        StringBuilder out = new StringBuilder(s.length());
        int from = 0; // start of the part of s that was not copied to out yet
        while (p >= 0 && s.length() >= p + 6) {
            // the four characters behind the marker must all be hex digits
            int code = 0;
            for (int i = p + 2; i < p + 6 && code >= 0; i++) {
                int digit = Character.digit(s.charAt(i), 16);
                code = digit < 0 ? -1 : (code << 4) | digit;
            }
            if (code < 0) {
                // malformed escape: keep the marker and go on searching behind it
                out.append(s, from, p + 2);
                from = p + 2;
            } else {
                out.append(s, from, p);
                char r = (char) code;
                out.append(r < ' ' ? ' ' : r);
                from = p + 6;
            }
            p = s.indexOf("\\u", from);
        }
        return out.append(s, from, s.length()).toString();
    }
}