org.loklak.objects.MessageEntry.java Source code

Java tutorial

Introduction

Here is the source code for org.loklak.objects.MessageEntry.java

Source

/**
 *  MessageEntry
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *  
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; wo even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *  
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.objects;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.eclipse.jetty.util.log.Log;
import org.json.JSONObject;
import org.loklak.api.search.ShortlinkFromTweetServlet;
import org.loklak.data.Classifier;
import org.loklak.data.DAO;
import org.loklak.data.Classifier.Category;
import org.loklak.data.Classifier.Context;
import org.loklak.geo.GeoMark;
import org.loklak.geo.LocationSource;
import org.loklak.objects.QueryEntry.PlaceContext;
import org.loklak.tools.bayes.Classification;

public class MessageEntry extends AbstractObjectEntry implements ObjectEntry {

    public static final String RICH_TEXT_SEPARATOR = "\n***\n";

    protected Date timestamp, created_at, on, to; // created_at will allways be set, on means 'valid from' and 'to' means 'valid_until' and may not be set
    protected SourceType source_type; // where did the message come from
    protected ProviderType provider_type; // who created the message
    protected String provider_hash, screen_name, retweet_from, id_str, canonical_id, parent, text;
    protected URL status_id_url;
    protected long retweet_count, favourites_count;
    protected LinkedHashSet<String> images, audio, videos;
    protected String place_name, place_id;

    // the following fields are either set as a common field or generated by extraction from field 'text' or from field 'place_name'
    protected double[] location_point, location_mark; // coordinate order is [longitude, latitude]
    protected int location_radius; // meter
    protected LocationSource location_source;
    protected PlaceContext place_context;
    protected String place_country;
    private boolean enriched;

    // the following can be computed from the tweet data but is stored in the search index to provide statistical data and ranking attributes
    private int without_l_len, without_lu_len, without_luh_len; // the length of tweets without links, users, hashtags
    private String[] hosts, links, mentions, hashtags; // the arrays of links, users, hashtags
    private Map<Context, Classification<String, Category>> classifier;

    public MessageEntry() throws MalformedURLException {
        this.timestamp = new Date();
        this.created_at = new Date();
        this.on = null;
        this.to = null;
        this.source_type = SourceType.GENERIC;
        this.provider_type = ProviderType.NOONE;
        this.provider_hash = "";
        this.screen_name = "";
        this.retweet_from = "";
        this.id_str = "";
        this.canonical_id = "";
        this.parent = "";
        this.text = "";
        this.status_id_url = null;
        this.retweet_count = 0;
        this.favourites_count = 0;
        this.images = new LinkedHashSet<String>();
        this.audio = new LinkedHashSet<String>();
        this.videos = new LinkedHashSet<String>();
        this.place_id = "";
        this.place_name = "";
        this.place_context = null;
        this.place_country = null;
        this.location_point = null;
        this.location_radius = 0;
        this.location_mark = null;
        this.location_source = null;
        this.without_l_len = 0;
        this.without_lu_len = 0;
        this.without_luh_len = 0;
        this.hosts = new String[0];
        this.links = new String[0];
        this.mentions = new String[0];
        this.hashtags = new String[0];
        this.classifier = null;
        this.enriched = false;
    }

    public MessageEntry(JSONObject json) {
        Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME);
        this.timestamp = parseDate(timestamp_obj);
        Object created_at_obj = lazyGet(json, "created_at");
        this.created_at = parseDate(created_at_obj);
        Object on_obj = lazyGet(json, "on");
        this.on = on_obj == null ? null : parseDate(on);
        Object to_obj = lazyGet(json, "to");
        this.to = to_obj == null ? null : parseDate(to);
        String source_type_string = (String) lazyGet(json, "source_type");
        try {
            this.source_type = source_type_string == null ? SourceType.GENERIC
                    : SourceType.byName(source_type_string);
        } catch (IllegalArgumentException e) {
            this.source_type = SourceType.GENERIC;
        }
        String provider_type_string = (String) lazyGet(json, "provider_type");
        if (provider_type_string == null)
            provider_type_string = ProviderType.NOONE.name();
        try {
            this.provider_type = ProviderType.valueOf(provider_type_string);
        } catch (IllegalArgumentException e) {
            this.provider_type = ProviderType.NOONE;
        }
        this.provider_hash = (String) lazyGet(json, "provider_hash");
        this.screen_name = (String) lazyGet(json, "screen_name");
        this.retweet_from = (String) lazyGet(json, "retweet_from");
        this.id_str = (String) lazyGet(json, "id_str");
        this.text = (String) lazyGet(json, "text");
        try {
            this.status_id_url = new URL((String) lazyGet(json, "link"));
        } catch (MalformedURLException e) {
            this.status_id_url = null;
        }
        this.retweet_count = parseLong((Number) lazyGet(json, "retweet_count"));
        this.favourites_count = parseLong((Number) lazyGet(json, "favourites_count"));
        this.images = parseArrayList(lazyGet(json, "images"));
        this.audio = parseArrayList(lazyGet(json, "audio"));
        this.videos = parseArrayList(lazyGet(json, "videos"));
        this.place_id = parseString((String) lazyGet(json, "place_id"));
        this.place_name = parseString((String) lazyGet(json, "place_name"));
        this.place_country = parseString((String) lazyGet(json, "place_country"));
        if (this.place_country != null && this.place_country.length() != 2)
            this.place_country = null;

        // optional location
        Object location_point_obj = lazyGet(json, "location_point");
        Object location_radius_obj = lazyGet(json, "location_radius");
        Object location_mark_obj = lazyGet(json, "location_mark");
        Object location_source_obj = lazyGet(json, "location_source");
        if (location_point_obj == null || location_mark_obj == null || !(location_point_obj instanceof List<?>)
                || !(location_mark_obj instanceof List<?>)) {
            this.location_point = null;
            this.location_radius = 0;
            this.location_mark = null;
            this.location_source = null;
        } else {
            this.location_point = new double[] { (Double) ((List<?>) location_point_obj).get(0),
                    (Double) ((List<?>) location_point_obj).get(1) };
            this.location_radius = (int) parseLong((Number) location_radius_obj);
            this.location_mark = new double[] { (Double) ((List<?>) location_mark_obj).get(0),
                    (Double) ((List<?>) location_mark_obj).get(1) };
            this.location_source = LocationSource.valueOf((String) location_source_obj);
        }
        this.enriched = false;

        // load enriched data
        enrich();
    }

    public Date getTimestamp() {
        return this.timestamp == null ? new Date() : this.timestamp;
    }

    public Date getCreatedAt() {
        return this.created_at == null ? new Date() : this.created_at;
    }

    public void setCreatedAt(Date created_at) {
        this.created_at = created_at;
    }

    public Date getOn() {
        return this.on;
    }

    public void setOn(Date on) {
        this.on = on;
    }

    public Date getTo() {
        return this.to;
    }

    public void setTo(Date to) {
        this.to = to;
    }

    public SourceType getSourceType() {
        return this.source_type;
    }

    public void setSourceType(SourceType source_type) {
        this.source_type = source_type;
    }

    public ProviderType getProviderType() {
        return provider_type;
    }

    public void setProviderType(ProviderType provider_type) {
        this.provider_type = provider_type;
    }

    public String getProviderHash() {
        return provider_hash;
    }

    public void setProviderHash(String provider_hash) {
        this.provider_hash = provider_hash;
    }

    public String getScreenName() {
        return screen_name;
    }

    public void setScreenName(String user_screen_name) {
        this.screen_name = user_screen_name;
    }

    public String getRetweetFrom() {
        return this.retweet_from;
    }

    public void setRetweetFrom(String retweet_from) {
        this.retweet_from = retweet_from;
    }

    public String getIdStr() {
        return id_str;
    }

    public void setIdStr(String id_str) {
        this.id_str = id_str;
    }

    public URL getStatusIdUrl() {
        return this.status_id_url;
    }

    public void setStatusIdUrl(URL status_id_url) {
        this.status_id_url = status_id_url;
    }

    public long getRetweetCount() {
        return retweet_count;
    }

    public void setRetweetCount(long retweet_count) {
        this.retweet_count = retweet_count;
    }

    public long getFavouritesCount() {
        return this.favourites_count;
    }

    public void setFavouritesCount(long favourites_count) {
        this.favourites_count = favourites_count;
    }

    public String getPlaceName() {
        return place_name;
    }

    public PlaceContext getPlaceContext() {
        return place_context;
    }

    public void setPlaceName(String place_name, PlaceContext place_context) {
        this.place_name = place_name;
        this.place_context = place_context;
    }

    public String getPlaceId() {
        return place_id;
    }

    public void setPlaceId(String place_id) {
        this.place_id = place_id;
    }

    /**
     * @return [longitude, latitude]
     */
    public double[] getLocationPoint() {
        return location_point;
    }

    /**
     * set the location
     * @param location_point in the form [longitude, latitude]
     */
    public void setLocationPoint(double[] location_point) {
        this.location_point = location_point;
    }

    /**
     * @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint()
     */
    public double[] getLocationMark() {
        return location_mark;
    }

    /**
     * set the location
     * @param location_point in the form [longitude, latitude]
     */
    public void setLocationMark(double[] location_mark) {
        this.location_mark = location_mark;
    }

    /**
     * get the radius in meter
     * @return radius in meter around getLocationPoint() (NOT getLocationMark())
     */
    public int getLocationRadius() {
        return location_radius;
    }

    public void setLocationRadius(int location_radius) {
        this.location_radius = location_radius;
    }

    public LocationSource getLocationSource() {
        return location_source;
    }

    public void setLocationSource(LocationSource location_source) {
        this.location_source = location_source;
    }

    public void setText(String text) {
        this.text = text;
    }

    public void setImages(ArrayList<String> images) {
        this.images = parseArrayList(images);
    }

    public void setImages(String[] images) {
        this.images = parseArrayList(images);
    }

    public void setImages(String image) {
        this.images = parseArrayList(image);
    }

    public long getId() {
        return Long.parseLong(this.id_str);
    }

    public String[] getHosts() {
        return this.hosts;
    }

    public String getText(final int iflinkexceedslength, final String urlstub) {
        // check if we shall replace shortlinks
        String t = this.text;
        String[] links = this.getLinks();
        if (links != null) {
            linkloop: for (int nth = 0; nth < links.length; nth++) {
                String link = links[nth];
                if (link.length() > iflinkexceedslength) {
                    if (!DAO.existMessage(this.getIdStr()))
                        break linkloop;
                    t = t.replace(link, urlstub + "/x?id=" + this.getIdStr() + (nth == 0 ? ""
                            : ShortlinkFromTweetServlet.SHORTLINK_COUNTER_SEPERATOR + Integer.toString(nth)));
                }
            }
        }

        return t;
    }

    public String[] getMentions() {
        return this.mentions;
    }

    public String[] getHashtags() {
        return this.hashtags;
    }

    public String[] getLinks() {
        return this.links;
    }

    public Collection<String> getImages() {
        return this.images;
    }

    public Classifier.Category getClassifier(Classifier.Context context) {
        if (this.classifier == null)
            return null;
        Classification<String, Category> classification = this.classifier.get(context);
        if (classification == null)
            return null;
        return classification.getCategory() == Classifier.Category.NONE ? null : classification.getCategory();
    }

    public double getClassifierProbability(Classifier.Context context) {
        if (this.classifier == null)
            return 0.0d;
        Classification<String, Category> classification = this.classifier.get(context);
        if (classification == null)
            return 0.0d;
        return classification.getProbability();
    }

    final static Pattern SPACEX_PATTERN = Pattern.compile("  +"); // two or more
    final static Pattern URL_PATTERN = Pattern
            .compile("(?:\\b|^)(https?://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|])"); // right boundary must be space or ) since others may appear in urls
    final static Pattern USER_PATTERN = Pattern.compile("(?:[ (]|^)(@..*?)(?:\\b|$)"); // left boundary must be space since the @ is itself a boundary
    final static Pattern HASHTAG_PATTERN = Pattern.compile("(?:[ (]|^)(#..*?)(?:\\b|$)"); // left boundary must be a space since the # is itself a boundary

    /**
     * create enriched data, useful for analytics and ranking:
     * - identify all mentioned users, hashtags and links
     * - count message size without links
     * - count message size without links and without users
     */
    public void enrich() {
        if (this.enriched)
            return;
        StringBuilder t = new StringBuilder(this.text);

        // extract the links
        List<String> links = extract(t, URL_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_l_len = t.length(); // len_no_l

        // extract the users
        List<String> users = extract(t, USER_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_lu_len = t.length(); // len_no_l_and_users

        // extract the hashtags
        List<String> hashtags = extract(t, HASHTAG_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_luh_len = t.length(); // len_no_l_and_users_and_hashtags

        // extract the hosts from the links
        Set<String> hosts = new LinkedHashSet<String>();
        for (String u : links) {
            try {
                URL url = new URL(u);
                hosts.add(url.getHost());
            } catch (MalformedURLException e) {
            }
        }

        this.hosts = new String[hosts.size()];
        int j = 0;
        for (String host : hosts)
            this.hosts[j++] = host.toLowerCase();

        this.mentions = new String[users.size()];
        for (int i = 0; i < users.size(); i++)
            this.mentions[i] = users.get(i).substring(1);

        this.hashtags = new String[hashtags.size()];
        for (int i = 0; i < hashtags.size(); i++)
            this.hashtags[i] = hashtags.get(i).substring(1).toLowerCase();

        this.links = new String[links.size()];
        for (int i = 0; i < links.size(); i++)
            this.links[i] = links.get(i);

        // classify content
        this.classifier = Classifier.classify(this.text);

        // more media data, analyze the links
        for (String link : this.links) {
            if (link.endsWith(".mp4") || link.endsWith(".m4v") || link.indexOf("vimeo.com") > 0
                    || link.indexOf("youtube.com") > 0 || link.indexOf("youtu.be") > 0
                    || link.indexOf("vine.co") > 0 || link.indexOf("ted.com") > 0) {
                this.videos.add(link);
                continue;
            }
            if (link.endsWith(".mp3") || link.indexOf("soundcloud.com") > 0) {
                this.audio.add(link);
                continue;
            }
            if (link.endsWith(".jpg") || link.endsWith(".jpeg") || link.endsWith(".png") || link.endsWith(".gif")
                    || link.indexOf("flickr.com") > 0 || link.indexOf("instagram.com") > 0
                    || link.indexOf("imgur.com") > 0 || link.indexOf("giphy.com") > 0
                    || link.indexOf("pic.twitter.com") > 0) {
                this.images.add(link);
                continue;
            }
        }

        // find location
        if ((this.location_point == null || this.location_point.length == 0) && DAO.geoNames != null) {
            GeoMark loc = null;
            if (this.place_name != null && this.place_name.length() > 0
                    && (this.location_source == null || this.location_source == LocationSource.ANNOTATION
                            || this.location_source == LocationSource.PLACE)) {
                loc = DAO.geoNames.analyse(this.place_name, null, 5, Integer.toString(this.text.hashCode()));
                this.place_context = PlaceContext.FROM;
                this.location_source = LocationSource.PLACE;
            }
            if (loc == null) {
                loc = DAO.geoNames.analyse(this.text, this.hashtags, 5, Integer.toString(this.text.hashCode()));
                this.place_context = PlaceContext.ABOUT;
                this.location_source = LocationSource.ANNOTATION;
            }
            if (loc != null) {
                if (this.place_name == null || this.place_name.length() == 0)
                    this.place_name = loc.getNames().iterator().next();
                this.location_radius = 0;
                this.location_point = new double[] { loc.lon(), loc.lat() }; //[longitude, latitude]
                this.location_mark = new double[] { loc.mlon(), loc.mlat() }; //[longitude, latitude]
                this.place_country = loc.getISO3166cc();
            }
        }
        this.enriched = true;
    }

    private static List<String> extract(StringBuilder s, Pattern p, int g) {
        Matcher m = p.matcher(s.toString());
        List<String> l = new ArrayList<String>();
        while (m.find())
            l.add(m.group(g));
        for (String r : l) {
            int i = s.indexOf(r);
            s.replace(i, i + r.length(), "");
        }
        return l;
    }

    @Override
    public JSONObject toJSON() {
        return toJSON(null, true, Integer.MAX_VALUE, ""); // very important to include calculated data here because that is written into the index using the abstract index factory
    }

    public JSONObject toJSON(final UserEntry user, final boolean calculatedData, final int iflinkexceedslength,
            final String urlstub) {
        JSONObject m = new JSONObject(true);

        // tweet data
        m.put(AbstractObjectEntry.TIMESTAMP_FIELDNAME, utcFormatter.print(getTimestamp().getTime()));
        m.put("created_at", utcFormatter.print(getCreatedAt().getTime()));
        if (this.on != null)
            m.put("on", utcFormatter.print(this.on.getTime()));
        if (this.to != null)
            m.put("to", utcFormatter.print(this.to.getTime()));
        m.put("screen_name", this.screen_name);
        if (this.retweet_from != null && this.retweet_from.length() > 0)
            m.put("retweet_from", this.retweet_from);
        m.put("text", this.getText(iflinkexceedslength, urlstub)); // the tweet; the cleanup is a helper function which cleans mistakes from the past in scraping
        if (this.status_id_url != null)
            m.put("link", this.status_id_url.toExternalForm());
        m.put("id_str", this.id_str);
        if (this.canonical_id != null)
            m.put("canonical_id", this.canonical_id);
        if (this.parent != null)
            m.put("parent", this.parent);
        m.put("source_type", this.source_type.toString());
        m.put("provider_type", this.provider_type.name());
        if (this.provider_hash != null && this.provider_hash.length() > 0)
            m.put("provider_hash", this.provider_hash);
        m.put("retweet_count", this.retweet_count);
        m.put("favourites_count", this.favourites_count); // there is a slight inconsistency here in the plural naming but thats how it is noted in the twitter api
        m.put("place_name", this.place_name);
        m.put("place_id", this.place_id);

        // add statistic/calculated data
        if (calculatedData) {

            // location data
            if (this.place_context != null)
                m.put("place_context", this.place_context.name());
            if (this.place_country != null && this.place_country.length() == 2) {
                m.put("place_country", DAO.geoNames.getCountryName(this.place_country));
                m.put("place_country_code", this.place_country);
                m.put("place_country_center", DAO.geoNames.getCountryCenter(this.place_country));
            }

            // add optional location data. This is written even if calculatedData == false if the source is from REPORT to prevent that it is lost
            if (this.location_point != null && this.location_point.length == 2 && this.location_mark != null
                    && this.location_mark.length == 2) {
                // reference for this format: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-geo-point-type.html#_lat_lon_as_array_5
                m.put("location_point", this.location_point); // [longitude, latitude]
                m.put("location_radius", this.location_radius);
                m.put("location_mark", this.location_mark);
                m.put("location_source", this.location_source.name());
            }

            // redundant data for enhanced navigation with aggregations
            m.put("hosts", this.hosts);
            m.put("hosts_count", this.hosts.length);
            m.put("links", this.links);
            m.put("links_count", this.links.length);
            m.put("images", this.images);
            m.put("images_count", this.images.size());
            m.put("audio", this.audio);
            m.put("audio_count", this.audio.size());
            m.put("videos", this.videos);
            m.put("videos_count", this.videos.size());
            m.put("mentions", this.mentions);
            m.put("mentions_count", this.mentions.length);
            m.put("hashtags", this.hashtags);
            m.put("hashtags_count", this.hashtags.length);

            // text classifier
            if (this.classifier != null) {
                for (Map.Entry<Context, Classification<String, Category>> c : this.classifier.entrySet()) {
                    assert c.getValue() != null;
                    if (c.getValue().getCategory() == Classifier.Category.NONE)
                        continue; // we don't store non-existing classifications
                    m.put("classifier_" + c.getKey().name(), c.getValue().getCategory());
                    m.put("classifier_" + c.getKey().name() + "_probability",
                            c.getValue().getProbability() == Float.POSITIVE_INFINITY ? Float.MAX_VALUE
                                    : c.getValue().getProbability());
                }
            }

            // experimental, for ranking
            m.put("without_l_len", this.without_l_len);
            m.put("without_lu_len", this.without_lu_len);
            m.put("without_luh_len", this.without_luh_len);
        }

        // add user
        if (user != null)
            m.put("user", user.toJSON());
        return m;
    }

    public static String html2utf8(String s) {
        int p, q;
        // hex coding &#
        try {
            while ((p = s.indexOf("&#")) >= 0) {
                q = s.indexOf(';', p + 2);
                if (q < p)
                    break;
                String charcode = s.substring(p + 2, q);
                int unicode = s.charAt(0) == 'x' ? Integer.parseInt(charcode.substring(1), 16)
                        : Integer.parseInt(charcode);
                s = s.substring(0, p) + ((unicode == 10 || unicode == 13) ? "\n" : ((char) unicode))
                        + s.substring(q + 1);
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
        }
        // octal coding \\u
        try {
            while ((p = s.indexOf("\\u")) >= 0 && s.length() >= p + 6) {
                char r = ((char) Integer.parseInt(s.substring(p + 2, p + 6), 8));
                if (r < ' ')
                    r = ' ';
                s = s.substring(0, p) + r + s.substring(p + 6);
            }
        } catch (Throwable e) {
            Log.getLog().warn(e);
        }
        // remove tags
        s = A_END_TAG.matcher(s).replaceAll("");
        s = QUOT_TAG.matcher(s).replaceAll("\"");
        s = AMP_TAG.matcher(s).replaceAll("&");
        // remove funny symbols
        StringBuilder clean = new StringBuilder(s.length() + 5);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (((int) c) == 8232 || c == '\n' || c == '\r')
                clean.append("\n");
            else if (c < ' ')
                clean.append(' ');
            else
                clean.append(c);
        }
        // remove double spaces
        return clean.toString().replaceAll("  ", " ");
    }

    private final static Pattern A_END_TAG = Pattern.compile("</a>");
    private final static Pattern QUOT_TAG = Pattern.compile("&quot;");
    private final static Pattern AMP_TAG = Pattern.compile("&amp;");
}