// Java tutorial
/** * MessageEntry * Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; wo even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package org.loklak.objects; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.eclipse.jetty.util.log.Log; import org.json.JSONObject; import org.loklak.api.search.ShortlinkFromTweetServlet; import org.loklak.data.Classifier; import org.loklak.data.DAO; import org.loklak.data.Classifier.Category; import org.loklak.data.Classifier.Context; import org.loklak.geo.GeoMark; import org.loklak.geo.LocationSource; import org.loklak.objects.QueryEntry.PlaceContext; import org.loklak.tools.bayes.Classification; public class MessageEntry extends AbstractObjectEntry implements ObjectEntry { public static final String RICH_TEXT_SEPARATOR = "\n***\n"; protected Date timestamp, created_at, on, to; // created_at will allways be set, on means 'valid from' and 'to' means 'valid_until' and may not be set protected SourceType source_type; // where did the message come from protected ProviderType provider_type; // who created the message protected String provider_hash, screen_name, 
retweet_from, id_str, canonical_id, parent, text; protected URL status_id_url; protected long retweet_count, favourites_count; protected LinkedHashSet<String> images, audio, videos; protected String place_name, place_id; // the following fields are either set as a common field or generated by extraction from field 'text' or from field 'place_name' protected double[] location_point, location_mark; // coordinate order is [longitude, latitude] protected int location_radius; // meter protected LocationSource location_source; protected PlaceContext place_context; protected String place_country; private boolean enriched; // the following can be computed from the tweet data but is stored in the search index to provide statistical data and ranking attributes private int without_l_len, without_lu_len, without_luh_len; // the length of tweets without links, users, hashtags private String[] hosts, links, mentions, hashtags; // the arrays of links, users, hashtags private Map<Context, Classification<String, Category>> classifier; public MessageEntry() throws MalformedURLException { this.timestamp = new Date(); this.created_at = new Date(); this.on = null; this.to = null; this.source_type = SourceType.GENERIC; this.provider_type = ProviderType.NOONE; this.provider_hash = ""; this.screen_name = ""; this.retweet_from = ""; this.id_str = ""; this.canonical_id = ""; this.parent = ""; this.text = ""; this.status_id_url = null; this.retweet_count = 0; this.favourites_count = 0; this.images = new LinkedHashSet<String>(); this.audio = new LinkedHashSet<String>(); this.videos = new LinkedHashSet<String>(); this.place_id = ""; this.place_name = ""; this.place_context = null; this.place_country = null; this.location_point = null; this.location_radius = 0; this.location_mark = null; this.location_source = null; this.without_l_len = 0; this.without_lu_len = 0; this.without_luh_len = 0; this.hosts = new String[0]; this.links = new String[0]; this.mentions = new String[0]; this.hashtags = new 
String[0]; this.classifier = null; this.enriched = false; } public MessageEntry(JSONObject json) { Object timestamp_obj = lazyGet(json, AbstractObjectEntry.TIMESTAMP_FIELDNAME); this.timestamp = parseDate(timestamp_obj); Object created_at_obj = lazyGet(json, "created_at"); this.created_at = parseDate(created_at_obj); Object on_obj = lazyGet(json, "on"); this.on = on_obj == null ? null : parseDate(on); Object to_obj = lazyGet(json, "to"); this.to = to_obj == null ? null : parseDate(to); String source_type_string = (String) lazyGet(json, "source_type"); try { this.source_type = source_type_string == null ? SourceType.GENERIC : SourceType.byName(source_type_string); } catch (IllegalArgumentException e) { this.source_type = SourceType.GENERIC; } String provider_type_string = (String) lazyGet(json, "provider_type"); if (provider_type_string == null) provider_type_string = ProviderType.NOONE.name(); try { this.provider_type = ProviderType.valueOf(provider_type_string); } catch (IllegalArgumentException e) { this.provider_type = ProviderType.NOONE; } this.provider_hash = (String) lazyGet(json, "provider_hash"); this.screen_name = (String) lazyGet(json, "screen_name"); this.retweet_from = (String) lazyGet(json, "retweet_from"); this.id_str = (String) lazyGet(json, "id_str"); this.text = (String) lazyGet(json, "text"); try { this.status_id_url = new URL((String) lazyGet(json, "link")); } catch (MalformedURLException e) { this.status_id_url = null; } this.retweet_count = parseLong((Number) lazyGet(json, "retweet_count")); this.favourites_count = parseLong((Number) lazyGet(json, "favourites_count")); this.images = parseArrayList(lazyGet(json, "images")); this.audio = parseArrayList(lazyGet(json, "audio")); this.videos = parseArrayList(lazyGet(json, "videos")); this.place_id = parseString((String) lazyGet(json, "place_id")); this.place_name = parseString((String) lazyGet(json, "place_name")); this.place_country = parseString((String) lazyGet(json, "place_country")); if 
(this.place_country != null && this.place_country.length() != 2) this.place_country = null; // optional location Object location_point_obj = lazyGet(json, "location_point"); Object location_radius_obj = lazyGet(json, "location_radius"); Object location_mark_obj = lazyGet(json, "location_mark"); Object location_source_obj = lazyGet(json, "location_source"); if (location_point_obj == null || location_mark_obj == null || !(location_point_obj instanceof List<?>) || !(location_mark_obj instanceof List<?>)) { this.location_point = null; this.location_radius = 0; this.location_mark = null; this.location_source = null; } else { this.location_point = new double[] { (Double) ((List<?>) location_point_obj).get(0), (Double) ((List<?>) location_point_obj).get(1) }; this.location_radius = (int) parseLong((Number) location_radius_obj); this.location_mark = new double[] { (Double) ((List<?>) location_mark_obj).get(0), (Double) ((List<?>) location_mark_obj).get(1) }; this.location_source = LocationSource.valueOf((String) location_source_obj); } this.enriched = false; // load enriched data enrich(); } public Date getTimestamp() { return this.timestamp == null ? new Date() : this.timestamp; } public Date getCreatedAt() { return this.created_at == null ? 
new Date() : this.created_at; } public void setCreatedAt(Date created_at) { this.created_at = created_at; } public Date getOn() { return this.on; } public void setOn(Date on) { this.on = on; } public Date getTo() { return this.to; } public void setTo(Date to) { this.to = to; } public SourceType getSourceType() { return this.source_type; } public void setSourceType(SourceType source_type) { this.source_type = source_type; } public ProviderType getProviderType() { return provider_type; } public void setProviderType(ProviderType provider_type) { this.provider_type = provider_type; } public String getProviderHash() { return provider_hash; } public void setProviderHash(String provider_hash) { this.provider_hash = provider_hash; } public String getScreenName() { return screen_name; } public void setScreenName(String user_screen_name) { this.screen_name = user_screen_name; } public String getRetweetFrom() { return this.retweet_from; } public void setRetweetFrom(String retweet_from) { this.retweet_from = retweet_from; } public String getIdStr() { return id_str; } public void setIdStr(String id_str) { this.id_str = id_str; } public URL getStatusIdUrl() { return this.status_id_url; } public void setStatusIdUrl(URL status_id_url) { this.status_id_url = status_id_url; } public long getRetweetCount() { return retweet_count; } public void setRetweetCount(long retweet_count) { this.retweet_count = retweet_count; } public long getFavouritesCount() { return this.favourites_count; } public void setFavouritesCount(long favourites_count) { this.favourites_count = favourites_count; } public String getPlaceName() { return place_name; } public PlaceContext getPlaceContext() { return place_context; } public void setPlaceName(String place_name, PlaceContext place_context) { this.place_name = place_name; this.place_context = place_context; } public String getPlaceId() { return place_id; } public void setPlaceId(String place_id) { this.place_id = place_id; } /** * @return [longitude, 
latitude] */ public double[] getLocationPoint() { return location_point; } /** * set the location * @param location_point in the form [longitude, latitude] */ public void setLocationPoint(double[] location_point) { this.location_point = location_point; } /** * @return [longitude, latitude] which is inside of getLocationRadius() from getLocationPoint() */ public double[] getLocationMark() { return location_mark; } /** * set the location * @param location_point in the form [longitude, latitude] */ public void setLocationMark(double[] location_mark) { this.location_mark = location_mark; } /** * get the radius in meter * @return radius in meter around getLocationPoint() (NOT getLocationMark()) */ public int getLocationRadius() { return location_radius; } public void setLocationRadius(int location_radius) { this.location_radius = location_radius; } public LocationSource getLocationSource() { return location_source; } public void setLocationSource(LocationSource location_source) { this.location_source = location_source; } public void setText(String text) { this.text = text; } public void setImages(ArrayList<String> images) { this.images = parseArrayList(images); } public void setImages(String[] images) { this.images = parseArrayList(images); } public void setImages(String image) { this.images = parseArrayList(image); } public long getId() { return Long.parseLong(this.id_str); } public String[] getHosts() { return this.hosts; } public String getText(final int iflinkexceedslength, final String urlstub) { // check if we shall replace shortlinks String t = this.text; String[] links = this.getLinks(); if (links != null) { linkloop: for (int nth = 0; nth < links.length; nth++) { String link = links[nth]; if (link.length() > iflinkexceedslength) { if (!DAO.existMessage(this.getIdStr())) break linkloop; t = t.replace(link, urlstub + "/x?id=" + this.getIdStr() + (nth == 0 ? 
"" : ShortlinkFromTweetServlet.SHORTLINK_COUNTER_SEPERATOR + Integer.toString(nth))); } } } return t; } public String[] getMentions() { return this.mentions; } public String[] getHashtags() { return this.hashtags; } public String[] getLinks() { return this.links; } public Collection<String> getImages() { return this.images; } public Classifier.Category getClassifier(Classifier.Context context) { if (this.classifier == null) return null; Classification<String, Category> classification = this.classifier.get(context); if (classification == null) return null; return classification.getCategory() == Classifier.Category.NONE ? null : classification.getCategory(); } public double getClassifierProbability(Classifier.Context context) { if (this.classifier == null) return 0.0d; Classification<String, Category> classification = this.classifier.get(context); if (classification == null) return 0.0d; return classification.getProbability(); } final static Pattern SPACEX_PATTERN = Pattern.compile(" +"); // two or more final static Pattern URL_PATTERN = Pattern .compile("(?:\\b|^)(https?://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|])"); // right boundary must be space or ) since others may appear in urls final static Pattern USER_PATTERN = Pattern.compile("(?:[ (]|^)(@..*?)(?:\\b|$)"); // left boundary must be space since the @ is itself a boundary final static Pattern HASHTAG_PATTERN = Pattern.compile("(?:[ (]|^)(#..*?)(?:\\b|$)"); // left boundary must be a space since the # is itself a boundary /** * create enriched data, useful for analytics and ranking: * - identify all mentioned users, hashtags and links * - count message size without links * - count message size without links and without users */ public void enrich() { if (this.enriched) return; StringBuilder t = new StringBuilder(this.text); // extract the links List<String> links = extract(t, URL_PATTERN, 1); t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim()); this.without_l_len = 
t.length(); // len_no_l // extract the users List<String> users = extract(t, USER_PATTERN, 1); t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim()); this.without_lu_len = t.length(); // len_no_l_and_users // extract the hashtags List<String> hashtags = extract(t, HASHTAG_PATTERN, 1); t = new StringBuilder(SPACEX_PATTERN.matcher(t).replaceAll(" ").trim()); this.without_luh_len = t.length(); // len_no_l_and_users_and_hashtags // extract the hosts from the links Set<String> hosts = new LinkedHashSet<String>(); for (String u : links) { try { URL url = new URL(u); hosts.add(url.getHost()); } catch (MalformedURLException e) { } } this.hosts = new String[hosts.size()]; int j = 0; for (String host : hosts) this.hosts[j++] = host.toLowerCase(); this.mentions = new String[users.size()]; for (int i = 0; i < users.size(); i++) this.mentions[i] = users.get(i).substring(1); this.hashtags = new String[hashtags.size()]; for (int i = 0; i < hashtags.size(); i++) this.hashtags[i] = hashtags.get(i).substring(1).toLowerCase(); this.links = new String[links.size()]; for (int i = 0; i < links.size(); i++) this.links[i] = links.get(i); // classify content this.classifier = Classifier.classify(this.text); // more media data, analyze the links for (String link : this.links) { if (link.endsWith(".mp4") || link.endsWith(".m4v") || link.indexOf("vimeo.com") > 0 || link.indexOf("youtube.com") > 0 || link.indexOf("youtu.be") > 0 || link.indexOf("vine.co") > 0 || link.indexOf("ted.com") > 0) { this.videos.add(link); continue; } if (link.endsWith(".mp3") || link.indexOf("soundcloud.com") > 0) { this.audio.add(link); continue; } if (link.endsWith(".jpg") || link.endsWith(".jpeg") || link.endsWith(".png") || link.endsWith(".gif") || link.indexOf("flickr.com") > 0 || link.indexOf("instagram.com") > 0 || link.indexOf("imgur.com") > 0 || link.indexOf("giphy.com") > 0 || link.indexOf("pic.twitter.com") > 0) { this.images.add(link); continue; } } // find location if 
((this.location_point == null || this.location_point.length == 0) && DAO.geoNames != null) { GeoMark loc = null; if (this.place_name != null && this.place_name.length() > 0 && (this.location_source == null || this.location_source == LocationSource.ANNOTATION || this.location_source == LocationSource.PLACE)) { loc = DAO.geoNames.analyse(this.place_name, null, 5, Integer.toString(this.text.hashCode())); this.place_context = PlaceContext.FROM; this.location_source = LocationSource.PLACE; } if (loc == null) { loc = DAO.geoNames.analyse(this.text, this.hashtags, 5, Integer.toString(this.text.hashCode())); this.place_context = PlaceContext.ABOUT; this.location_source = LocationSource.ANNOTATION; } if (loc != null) { if (this.place_name == null || this.place_name.length() == 0) this.place_name = loc.getNames().iterator().next(); this.location_radius = 0; this.location_point = new double[] { loc.lon(), loc.lat() }; //[longitude, latitude] this.location_mark = new double[] { loc.mlon(), loc.mlat() }; //[longitude, latitude] this.place_country = loc.getISO3166cc(); } } this.enriched = true; } private static List<String> extract(StringBuilder s, Pattern p, int g) { Matcher m = p.matcher(s.toString()); List<String> l = new ArrayList<String>(); while (m.find()) l.add(m.group(g)); for (String r : l) { int i = s.indexOf(r); s.replace(i, i + r.length(), ""); } return l; } @Override public JSONObject toJSON() { return toJSON(null, true, Integer.MAX_VALUE, ""); // very important to include calculated data here because that is written into the index using the abstract index factory } public JSONObject toJSON(final UserEntry user, final boolean calculatedData, final int iflinkexceedslength, final String urlstub) { JSONObject m = new JSONObject(true); // tweet data m.put(AbstractObjectEntry.TIMESTAMP_FIELDNAME, utcFormatter.print(getTimestamp().getTime())); m.put("created_at", utcFormatter.print(getCreatedAt().getTime())); if (this.on != null) m.put("on", 
utcFormatter.print(this.on.getTime())); if (this.to != null) m.put("to", utcFormatter.print(this.to.getTime())); m.put("screen_name", this.screen_name); if (this.retweet_from != null && this.retweet_from.length() > 0) m.put("retweet_from", this.retweet_from); m.put("text", this.getText(iflinkexceedslength, urlstub)); // the tweet; the cleanup is a helper function which cleans mistakes from the past in scraping if (this.status_id_url != null) m.put("link", this.status_id_url.toExternalForm()); m.put("id_str", this.id_str); if (this.canonical_id != null) m.put("canonical_id", this.canonical_id); if (this.parent != null) m.put("parent", this.parent); m.put("source_type", this.source_type.toString()); m.put("provider_type", this.provider_type.name()); if (this.provider_hash != null && this.provider_hash.length() > 0) m.put("provider_hash", this.provider_hash); m.put("retweet_count", this.retweet_count); m.put("favourites_count", this.favourites_count); // there is a slight inconsistency here in the plural naming but thats how it is noted in the twitter api m.put("place_name", this.place_name); m.put("place_id", this.place_id); // add statistic/calculated data if (calculatedData) { // location data if (this.place_context != null) m.put("place_context", this.place_context.name()); if (this.place_country != null && this.place_country.length() == 2) { m.put("place_country", DAO.geoNames.getCountryName(this.place_country)); m.put("place_country_code", this.place_country); m.put("place_country_center", DAO.geoNames.getCountryCenter(this.place_country)); } // add optional location data. 
This is written even if calculatedData == false if the source is from REPORT to prevent that it is lost if (this.location_point != null && this.location_point.length == 2 && this.location_mark != null && this.location_mark.length == 2) { // reference for this format: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-geo-point-type.html#_lat_lon_as_array_5 m.put("location_point", this.location_point); // [longitude, latitude] m.put("location_radius", this.location_radius); m.put("location_mark", this.location_mark); m.put("location_source", this.location_source.name()); } // redundant data for enhanced navigation with aggregations m.put("hosts", this.hosts); m.put("hosts_count", this.hosts.length); m.put("links", this.links); m.put("links_count", this.links.length); m.put("images", this.images); m.put("images_count", this.images.size()); m.put("audio", this.audio); m.put("audio_count", this.audio.size()); m.put("videos", this.videos); m.put("videos_count", this.videos.size()); m.put("mentions", this.mentions); m.put("mentions_count", this.mentions.length); m.put("hashtags", this.hashtags); m.put("hashtags_count", this.hashtags.length); // text classifier if (this.classifier != null) { for (Map.Entry<Context, Classification<String, Category>> c : this.classifier.entrySet()) { assert c.getValue() != null; if (c.getValue().getCategory() == Classifier.Category.NONE) continue; // we don't store non-existing classifications m.put("classifier_" + c.getKey().name(), c.getValue().getCategory()); m.put("classifier_" + c.getKey().name() + "_probability", c.getValue().getProbability() == Float.POSITIVE_INFINITY ? 
Float.MAX_VALUE : c.getValue().getProbability()); } } // experimental, for ranking m.put("without_l_len", this.without_l_len); m.put("without_lu_len", this.without_lu_len); m.put("without_luh_len", this.without_luh_len); } // add user if (user != null) m.put("user", user.toJSON()); return m; } public static String html2utf8(String s) { int p, q; // hex coding &# try { while ((p = s.indexOf("&#")) >= 0) { q = s.indexOf(';', p + 2); if (q < p) break; String charcode = s.substring(p + 2, q); int unicode = s.charAt(0) == 'x' ? Integer.parseInt(charcode.substring(1), 16) : Integer.parseInt(charcode); s = s.substring(0, p) + ((unicode == 10 || unicode == 13) ? "\n" : ((char) unicode)) + s.substring(q + 1); } } catch (Throwable e) { Log.getLog().warn(e); } // octal coding \\u try { while ((p = s.indexOf("\\u")) >= 0 && s.length() >= p + 6) { char r = ((char) Integer.parseInt(s.substring(p + 2, p + 6), 8)); if (r < ' ') r = ' '; s = s.substring(0, p) + r + s.substring(p + 6); } } catch (Throwable e) { Log.getLog().warn(e); } // remove tags s = A_END_TAG.matcher(s).replaceAll(""); s = QUOT_TAG.matcher(s).replaceAll("\""); s = AMP_TAG.matcher(s).replaceAll("&"); // remove funny symbols StringBuilder clean = new StringBuilder(s.length() + 5); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (((int) c) == 8232 || c == '\n' || c == '\r') clean.append("\n"); else if (c < ' ') clean.append(' '); else clean.append(c); } // remove double spaces return clean.toString().replaceAll(" ", " "); } private final static Pattern A_END_TAG = Pattern.compile("</a>"); private final static Pattern QUOT_TAG = Pattern.compile("""); private final static Pattern AMP_TAG = Pattern.compile("&"); }