Java tutorial
/**
 *  MessageEntry
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *  This class is the android version from the original file,
 *  taken from the loklak_server project. It may be slightly different.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.android.data;

import org.json.JSONException;
import org.json.JSONObject;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A single message (tweet) together with data derived from its text:
 * the links, hosts, mentions and hashtags it contains and the text length
 * without those elements.
 *
 * <p>Parsing helpers such as {@code lazyGet}, {@code parseDate}, {@code parseLong},
 * {@code parseString}, {@code parseArrayList} and the {@code utcFormatter} are
 * inherited from {@link AbstractIndexEntry}.
 */
public class MessageEntry extends AbstractIndexEntry {

    public static final String RICH_TEXT_SEPARATOR = "\n***\n";

    // created_at will always be set, on means 'valid from' and 'to' means 'valid_until' and may not be set
    protected Date created_at, on, to;
    protected String provider_hash, screen_name, retweet_from, id_str, canonical_id, parent, text;
    protected URL status_id_url;
    protected long retweet_count, favourites_count;
    protected LinkedHashSet<String> images, audio, videos;
    protected String place_name, place_id;

    // the following fields are either set as a common field or generated by extraction
    // from field 'text' or from field 'place_name'
    protected double[] location_point, location_mark; // coordinate order is [longitude, latitude]
    protected int location_radius; // meter
    protected String place_country;

    // the following can be computed from the tweet data but is stored in the search index
    // to provide statistical data and ranking attributes
    private int without_l_len, without_lu_len, without_luh_len; // the length of tweets without links, users, hashtags
    private String[] hosts, links, mentions, hashtags; // the arrays of links, users, hashtags

    /**
     * Creates an empty message: strings are empty, counters are zero, media sets
     * and derived arrays are empty, and {@code created_at} is the current time.
     *
     * @throws MalformedURLException never thrown by this body; the declaration is
     *         kept so that existing callers which catch it continue to compile
     */
    public MessageEntry() throws MalformedURLException {
        this.created_at = new Date();
        this.on = null;
        this.to = null;
        this.provider_hash = "";
        this.screen_name = "";
        this.retweet_from = "";
        this.id_str = "";
        this.canonical_id = "";
        this.parent = "";
        this.text = "";
        this.status_id_url = null;
        this.retweet_count = 0;
        this.favourites_count = 0;
        this.images = new LinkedHashSet<String>();
        this.audio = new LinkedHashSet<String>();
        this.videos = new LinkedHashSet<String>();
        this.place_id = "";
        this.place_name = "";
        this.place_country = null;
        this.location_point = null;
        this.location_radius = 0;
        this.location_mark = null;
        this.without_l_len = 0;
        this.without_lu_len = 0;
        this.without_luh_len = 0;
        this.hosts = new String[0];
        this.links = new String[0];
        this.mentions = new String[0];
        this.hashtags = new String[0];
    }

    /**
     * Creates a message from its JSON representation and computes the enriched data.
     *
     * @param json the object to read the message fields from
     */
    public MessageEntry(JSONObject json) {
        this.created_at = parseDate(lazyGet(json, "created_at"));
        // parse the values read from the input; the fields themselves are still unset here
        Object on_obj = lazyGet(json, "on");
        this.on = on_obj == null ? null : parseDate(on_obj);
        Object to_obj = lazyGet(json, "to");
        this.to = to_obj == null ? null : parseDate(to_obj);
        this.provider_hash = (String) lazyGet(json, "provider_hash");
        this.screen_name = (String) lazyGet(json, "screen_name");
        this.retweet_from = (String) lazyGet(json, "retweet_from");
        this.id_str = (String) lazyGet(json, "id_str");
        this.text = (String) lazyGet(json, "text");
        this.status_id_url = parseUrl(lazyGet(json, "link"));
        this.retweet_count = parseLong((Number) lazyGet(json, "retweet_count"));
        this.favourites_count = parseLong((Number) lazyGet(json, "favourites_count"));
        this.images = parseArrayList(lazyGet(json, "images"));
        this.audio = parseArrayList(lazyGet(json, "audio"));
        this.videos = parseArrayList(lazyGet(json, "videos"));
        this.place_id = parseString((String) lazyGet(json, "place_id"));
        this.place_name = parseString((String) lazyGet(json, "place_name"));
        this.place_country = parseString((String) lazyGet(json, "place_country"));
        if (this.place_country != null && this.place_country.length() != 2) {
            this.place_country = null; // only two-letter country codes are kept
        }

        // optional location
        setLocation(lazyGet(json, "location_point"), lazyGet(json, "location_radius"), lazyGet(json, "location_mark"));

        // load enriched data
        enrich();
    }

    /**
     * Creates a message from a key/value map and computes the enriched data.
     *
     * @param map the map to read the message fields from
     */
    public MessageEntry(Map<String, Object> map) {
        this.created_at = parseDate(map.get("created_at"));
        // parse the values read from the input; the fields themselves are still unset here
        Object on_obj = map.get("on");
        this.on = on_obj == null ? null : parseDate(on_obj);
        Object to_obj = map.get("to");
        this.to = to_obj == null ? null : parseDate(to_obj);
        this.provider_hash = (String) map.get("provider_hash");
        this.screen_name = (String) map.get("screen_name");
        this.retweet_from = (String) map.get("retweet_from");
        this.id_str = (String) map.get("id_str");
        this.text = (String) map.get("text");
        this.status_id_url = parseUrl(map.get("link"));
        this.retweet_count = parseLong((Number) map.get("retweet_count"));
        this.favourites_count = parseLong((Number) map.get("favourites_count"));
        this.images = parseArrayList(map.get("images"));
        this.audio = parseArrayList(map.get("audio"));
        this.videos = parseArrayList(map.get("videos"));
        this.place_id = parseString((String) map.get("place_id"));
        this.place_name = parseString((String) map.get("place_name"));
        this.place_country = parseString((String) map.get("place_country"));
        if (this.place_country != null && this.place_country.length() != 2) {
            this.place_country = null; // only two-letter country codes are kept
        }

        // optional location
        setLocation(map.get("location_point"), map.get("location_radius"), map.get("location_mark"));

        // load enriched data
        enrich();
    }

    /** Parses a link value into a URL; returns null if it is not a string or not a valid URL. */
    private static URL parseUrl(Object link) {
        if (!(link instanceof String)) {
            return null;
        }
        try {
            return new URL((String) link);
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /** Converts a list of at least two numbers into a {first, second} double pair; returns null otherwise. */
    private static double[] parsePoint(Object coordinates) {
        if (!(coordinates instanceof List<?>)) {
            return null;
        }
        List<?> list = (List<?>) coordinates;
        if (list.size() < 2 || !(list.get(0) instanceof Number) || !(list.get(1) instanceof Number)) {
            return null;
        }
        // accept any Number: integral JSON values are not necessarily delivered as Double
        return new double[] { ((Number) list.get(0)).doubleValue(), ((Number) list.get(1)).doubleValue() };
    }

    /** Sets point, radius and mark together; all three are cleared unless both point and mark are usable. */
    private void setLocation(Object point, Object radius, Object mark) {
        double[] parsedPoint = parsePoint(point);
        double[] parsedMark = parsePoint(mark);
        if (parsedPoint == null || parsedMark == null) {
            this.location_point = null;
            this.location_radius = 0;
            this.location_mark = null;
            return;
        }
        this.location_point = parsedPoint;
        this.location_radius = (int) parseLong((Number) radius);
        this.location_mark = parsedMark;
    }

    /**
     * @return the creation date, or the current time if none is set
     */
    public Date getCreatedAt() {
        return this.created_at == null ? new Date() : this.created_at;
    }

    public void setCreatedAt(Date created_at) {
        this.created_at = created_at;
    }

    public Date getOn() {
        return this.on;
    }

    public void setOn(Date on) {
        this.on = on;
    }

    public Date getTo() {
        return this.to;
    }

    public void setTo(Date to) {
        this.to = to;
    }

    public String getProviderHash() {
        return provider_hash;
    }

    public void setProviderHash(String provider_hash) {
        this.provider_hash = provider_hash;
    }

    public String getScreenName() {
        return screen_name;
    }

    public void setScreenName(String user_screen_name) {
        this.screen_name = user_screen_name;
    }

    public String getRetweetFrom() {
        return this.retweet_from;
    }

    public void setRetweetFrom(String retweet_from) {
        this.retweet_from = retweet_from;
    }

    public String getIdStr() {
        return id_str;
    }

    public void setIdStr(String id_str) {
        this.id_str = id_str;
    }

    public URL getStatusIdUrl() {
        return this.status_id_url;
    }

    public void setStatusIdUrl(URL status_id_url) {
        this.status_id_url = status_id_url;
    }

    public long getRetweetCount() {
        return retweet_count;
    }

    public void setRetweetCount(long retweet_count) {
        this.retweet_count = retweet_count;
    }

    public long getFavouritesCount() {
        return this.favourites_count;
    }

    public void setFavouritesCount(long favourites_count) {
        this.favourites_count = favourites_count;
    }

    public String getPlaceName() {
        return place_name;
    }

    public String getPlaceId() {
        return place_id;
    }

    public void setPlaceId(String place_id) {
        this.place_id = place_id;
    }

    /**
     * @return [longitude, latitude]
     */
    public double[] getLocationPoint() {
        return location_point;
    }

    /**
     * set the location
     * @param location_point in the form [longitude, latitude]
     */
    public void setLocationPoint(double[] location_point) {
        this.location_point = location_point;
    }

    /**
     * @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint()
     */
    public double[] getLocationMark() {
        return location_mark;
    }

    /**
     * set the location
     * @param location_mark in the form [longitude, latitude]
     */
    public void setLocationMark(double[] location_mark) {
        this.location_mark = location_mark;
    }

    /**
     * get the radius in meter
     * @return radius in meter around getLocationPoint() (NOT getLocationMark())
     */
    public int getLocationRadius() {
        return location_radius;
    }

    public void setLocationRadius(int location_radius) {
        this.location_radius = location_radius;
    }

    public void setText(String text) {
        this.text = text;
    }

    public void setImages(ArrayList<String> images) {
        this.images = parseArrayList(images);
    }

    public void setImages(String[] images) {
        this.images = parseArrayList(images);
    }

    public void setImages(String image) {
        this.images = parseArrayList(image);
    }

    /**
     * @return the numeric value of {@code id_str}
     * @throws NumberFormatException if {@code id_str} is not a parsable long (e.g. null or empty)
     */
    public long getId() {
        return Long.parseLong(this.id_str);
    }

    public String[] getHosts() {
        return this.hosts;
    }

    /**
     * Returns the message text. Both parameters are currently ignored by this implementation.
     *
     * @param iflinkexceedslength unused
     * @param urlstub unused
     * @return the unmodified text
     */
    public String getText(final int iflinkexceedslength, final String urlstub) {
        return this.text;
    }

    public String[] getMentions() {
        return this.mentions;
    }

    public String[] getHashtags() {
        return this.hashtags;
    }

    public String[] getLinks() {
        return this.links;
    }

    public Collection<String> getImages() {
        return this.images;
    }

    // two or more spaces; written with a quantifier so the pattern survives whitespace normalization
    static final Pattern SPACEX_PATTERN = Pattern.compile(" {2,}");
    // right boundary must be space since others may appear in urls
    static final Pattern URL_PATTERN = Pattern.compile("(?:\\b|^)(https?://.*?)(?:[) ]|$)");
    // left boundary must be space since the @ is itself a boundary
    static final Pattern USER_PATTERN = Pattern.compile("(?:[ (]|^)(@..*?)(?:\\b|$)");
    // left boundary must be a space since the # is itself a boundary
    static final Pattern HASHTAG_PATTERN = Pattern.compile("(?:[ (]|^)(#..*?)(?:\\b|$)");

    /**
     * create enriched data, useful for analytics and ranking:
     * - identify all mentioned users, hashtags and links
     * - count message size without links
     * - count message size without links and without users
     * - count message size without links, users and hashtags
     * - classify links into the video, audio and image sets
     * A null text is treated as an empty text.
     */
    public void enrich() {
        StringBuilder t = new StringBuilder(this.text == null ? "" : this.text);

        // extract the links
        List<String> foundLinks = extract(t, URL_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_l_len = t.length(); // len_no_l

        // extract the users
        List<String> foundUsers = extract(t, USER_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_lu_len = t.length(); // len_no_l_and_users

        // extract the hashtags
        List<String> foundHashtags = extract(t, HASHTAG_PATTERN, 1);
        t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim());
        this.without_luh_len = t.length(); // len_no_l_and_users_and_hashtags

        // extract the hosts from the links; lower-case before de-duplicating so that
        // hosts differing only in case are stored once
        Set<String> foundHosts = new LinkedHashSet<String>();
        for (String u : foundLinks) {
            try {
                foundHosts.add(new URL(u).getHost().toLowerCase(Locale.ROOT));
            } catch (MalformedURLException ignored) {
                // a link without a parsable URL simply contributes no host
            }
        }
        this.hosts = foundHosts.toArray(new String[foundHosts.size()]);

        this.mentions = new String[foundUsers.size()];
        for (int i = 0; i < foundUsers.size(); i++) {
            this.mentions[i] = foundUsers.get(i).substring(1); // drop the leading '@'
        }

        this.hashtags = new String[foundHashtags.size()];
        for (int i = 0; i < foundHashtags.size(); i++) {
            this.hashtags[i] = foundHashtags.get(i).substring(1).toLowerCase(Locale.ROOT); // drop the leading '#'
        }

        this.links = foundLinks.toArray(new String[foundLinks.size()]);

        // more media data, analyze the links
        if (this.videos == null) {
            this.videos = new LinkedHashSet<String>();
        }
        if (this.audio == null) {
            this.audio = new LinkedHashSet<String>();
        }
        if (this.images == null) {
            this.images = new LinkedHashSet<String>();
        }
        for (String link : this.links) {
            if (link.endsWith(".mp4") || link.endsWith(".m4v")
                    || link.indexOf("vimeo.com") > 0 || link.indexOf("youtube.com") > 0
                    || link.indexOf("youtu.be") > 0 || link.indexOf("vine.co") > 0
                    || link.indexOf("ted.com") > 0) {
                this.videos.add(link);
            } else if (link.endsWith(".mp3") || link.indexOf("soundcloud.com") > 0) {
                this.audio.add(link);
            } else if (link.indexOf("flickr.com") > 0 || link.indexOf("instagram.com") > 0
                    || link.indexOf("imgur.com") > 0 || link.indexOf("giphy.com") > 0) {
                this.images.add(link);
            }
        }
    }

    /**
     * Collects group {@code g} of every match of {@code p} in {@code s} and removes the
     * first occurrence of each collected string from {@code s}.
     *
     * @param s the text to search; it is modified in place
     * @param p the pattern to search for
     * @param g the capturing group to collect
     * @return the collected strings in match order
     */
    private static List<String> extract(StringBuilder s, Pattern p, int g) {
        Matcher m = p.matcher(s.toString());
        List<String> found = new ArrayList<String>();
        while (m.find()) {
            String match = m.group(g);
            if (match != null) {
                found.add(match);
            }
        }
        for (String r : found) {
            int i = s.indexOf(r);
            if (i >= 0) {
                s.delete(i, i + r.length());
            }
        }
        return found;
    }

    /**
     * Writes this message into a new JSON object.
     *
     * @param user the author; if not null it is added under the key "user"
     * @param calculatedData if true, country code, location and the derived
     *        hosts/links/mentions/hashtags data are added as well
     * @param iflinkexceedslength passed through to {@link #getText(int, String)}
     * @param urlstub passed through to {@link #getText(int, String)}
     * @return the JSON representation of this message
     * @throws JSONException if a value cannot be stored in the JSON object
     */
    public JSONObject toJSON(final UserEntry user, final boolean calculatedData, final int iflinkexceedslength,
            final String urlstub) throws JSONException {
        JSONObject json = new JSONObject();

        // tweet data
        json.put("created_at", utcFormatter.print(getCreatedAt().getTime()));
        if (this.on != null) {
            json.put("on", utcFormatter.print(this.on.getTime()));
        }
        if (this.to != null) {
            json.put("to", utcFormatter.print(this.to.getTime()));
        }
        json.put("screen_name", this.screen_name);
        if (this.retweet_from != null && this.retweet_from.length() > 0) {
            json.put("retweet_from", this.retweet_from);
        }
        json.put("text", this.getText(iflinkexceedslength, urlstub)); // the tweet
        if (this.status_id_url != null) {
            json.put("link", this.status_id_url.toExternalForm());
        }
        json.put("id_str", this.id_str);
        if (this.canonical_id != null) {
            json.put("canonical_id", this.canonical_id);
        }
        if (this.parent != null) {
            json.put("parent", this.parent);
        }
        if (this.provider_hash != null && this.provider_hash.length() > 0) {
            json.put("provider_hash", this.provider_hash);
        }
        json.put("retweet_count", this.retweet_count);
        // there is a slight inconsistency here in the plural naming but thats how it is noted in the twitter api
        json.put("favourites_count", this.favourites_count);
        // NOTE(review): collections and arrays are handed to put() as-is; whether they are
        // serialized as JSON arrays depends on the org.json implementation in use - confirm
        json.put("images", this.images);
        json.put("images_count", this.images.size());
        json.put("audio", this.audio);
        json.put("audio_count", this.audio.size());
        json.put("videos", this.videos);
        json.put("videos_count", this.videos.size());
        json.put("place_name", this.place_name);
        json.put("place_id", this.place_id);

        // add statistic/calculated data
        if (calculatedData) {

            // location data
            if (this.place_country != null && this.place_country.length() == 2) {
                json.put("place_country_code", this.place_country);
            }

            // add optional location data; in this implementation it is only written when calculatedData is true
            if (this.location_point != null && this.location_point.length == 2
                    && this.location_mark != null && this.location_mark.length == 2) {
                // reference for this format: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-geo-point-type.html#_lat_lon_as_array_5
                json.put("location_point", this.location_point); // [longitude, latitude]
                json.put("location_radius", this.location_radius);
                json.put("location_mark", this.location_mark);
            }

            // redundant data for enhanced navigation with aggregations
            json.put("hosts", this.hosts);
            json.put("hosts_count", this.hosts.length);
            json.put("links", this.links);
            json.put("links_count", this.links.length);
            json.put("mentions", this.mentions);
            json.put("mentions_count", this.mentions.length);
            json.put("hashtags", this.hashtags);
            json.put("hashtags_count", this.hashtags.length);
        }

        // add user
        if (user != null) {
            json.put("user", user.toJSON());
        }
        return json;
    }

    /**
     * Cleans scraped text: decodes numeric character references ({@code &#NNN;} and
     * {@code &#xHHHH;}) and {@code \}{@code uHHHH} escapes, removes closing anchor tags, decodes
     * {@code &quot;} and {@code &amp;}, turns line breaks (including U+2028) into
     * {@code \n}, replaces other control characters with a space and collapses runs of
     * spaces into one. Malformed references and escapes are left in the text unchanged.
     *
     * @param s the text to clean; null is treated as empty
     * @return the cleaned text, never null
     */
    public static String html2utf8(String s) {
        if (s == null) {
            return "";
        }
        s = decodeNumericEntities(s);
        s = decodeUnicodeEscapes(s);

        // remove tags and decode the remaining named entities
        s = s.replace("</a>", "").replace("&quot;", "\"").replace("&amp;", "&");

        // remove funny symbols
        StringBuilder clean = new StringBuilder(s.length() + 5);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (((int) c) == 8232 || c == '\n' || c == '\r') {
                clean.append("\n");
            } else if (c < ' ') {
                clean.append(' ');
            } else {
                clean.append(c);
            }
        }

        // remove double spaces
        return SPACEX_PATTERN.matcher(clean).replaceAll(" ");
    }

    /** Replaces every well-formed decimal or hexadecimal character reference with its character. */
    private static String decodeNumericEntities(String s) {
        int p = 0;
        while ((p = s.indexOf("&#", p)) >= 0) {
            int q = s.indexOf(';', p + 2);
            if (q < 0) {
                break; // no terminator anywhere behind p, nothing more to decode
            }
            String decoded = decodeCharCode(s.substring(p + 2, q));
            if (decoded == null) {
                p += 2; // malformed reference: keep it and continue behind the "&#"
            } else {
                // continue the search at p again so that a decoded '&' can start a new reference
                s = s.substring(0, p) + decoded + s.substring(q + 1);
            }
        }
        return s;
    }

    /** Decodes the part between "&#" and ";"; returns null if it is not a valid character code. */
    private static String decodeCharCode(String charcode) {
        if (charcode.length() == 0) {
            return null;
        }
        int unicode;
        try {
            char first = charcode.charAt(0);
            unicode = (first == 'x' || first == 'X')
                    ? Integer.parseInt(charcode.substring(1), 16)
                    : Integer.parseInt(charcode);
        } catch (NumberFormatException e) {
            return null;
        }
        if (unicode == 10 || unicode == 13) {
            return "\n";
        }
        if (!Character.isValidCodePoint(unicode)) {
            return null;
        }
        // toChars yields a surrogate pair for code points above U+FFFF
        return new String(Character.toChars(unicode));
    }

    /** Replaces every backslash-u escape followed by four hexadecimal digits with its character. */
    private static String decodeUnicodeEscapes(String s) {
        int p = 0;
        while ((p = s.indexOf("\\u", p)) >= 0 && s.length() >= p + 6) {
            int code = 0;
            boolean valid = true;
            for (int i = p + 2; i < p + 6; i++) {
                int digit = Character.digit(s.charAt(i), 16);
                if (digit < 0) {
                    valid = false;
                    break;
                }
                code = code * 16 + digit;
            }
            if (!valid) {
                p += 2; // not an escape: keep it and continue behind the "\\u"
                continue;
            }
            char r = (char) code;
            if (r < ' ') {
                r = ' ';
            }
            s = s.substring(0, p) + r + s.substring(p + 6);
        }
        return s;
    }
}