Java tutorial: org.loklak.data.DAO, the loklak server's Data Access Object
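Before reading the source, it helps to see how the class is meant to be driven. The sketch below is illustrative only: it assumes the loklak server classes and its conf/ directory (susi files, elasticsearch mappings, iso3166.json) are available on the classpath and working directory, as they would be in a checkout of the project. The real server builds the configuration map from its config.properties file; the three keys set here are just the ones that the init() code shown further down reads without a default.

    // Hypothetical bootstrap sketch, not part of the loklak sources.
    import java.nio.file.Paths;
    import java.util.HashMap;
    import java.util.Map;
    import org.loklak.data.DAO;

    public class DaoBootstrapSketch {
        public static void main(String[] args) throws Exception {
            Map<String, String> config = new HashMap<>();
            config.put("elasticsearch.index.number_of_shards", "1");
            config.put("elasticsearch.index.number_of_replicas", "0");
            config.put("elasticsearch_requiredClusterHealthStatus", "YELLOW");
            DAO.init(config, Paths.get("data"));   // boots elasticsearch, dump files and AAA storage
            DAO.log("local messages: " + DAO.countLocalMessages());
            DAO.close();                           // flushes the caches and closes the index
        }
    }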
/**
 *  DAO
 *  Copyright 22.02.2015 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak.data;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.security.KeyPair;
import java.security.KeyPairGenerator;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;

import org.eclipse.jetty.util.ConcurrentHashSet;

import com.github.fge.jackson.JsonLoader;
import com.google.common.base.Charsets;
import com.google.common.io.Files;

import org.eclipse.jetty.util.log.Log;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.common.logging.ESLoggerFactory;
import org.elasticsearch.common.logging.slf4j.Slf4jESLoggerFactory;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.search.sort.SortOrder;
import org.json.JSONException;
import org.json.JSONObject;
import org.loklak.Caretaker;
import org.loklak.api.search.SearchServlet;
import org.loklak.geo.GeoNames;
import org.loklak.harvester.TwitterScraper;
import org.loklak.http.AccessTracker;
import org.loklak.http.ClientConnection;
import org.loklak.http.RemoteAccess;
import org.loklak.objects.AccountEntry;
import org.loklak.objects.ImportProfileEntry;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.Peers;
import org.loklak.objects.QueryEntry;
import org.loklak.objects.ResultList;
import org.loklak.objects.SourceType;
import org.loklak.objects.Timeline;
import org.loklak.objects.UserEntry;
import org.loklak.server.*;
import org.loklak.susi.SusiMind;
import org.loklak.tools.DateParser;
import org.loklak.tools.OS;
import org.loklak.tools.storage.*;

import com.fasterxml.jackson.databind.JsonNode;

/**
 * The Data Access Object for the message project.
 * This provides only static methods because the class methods shall be available for
 * all other classes.
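 *
 * Usage hint (added here, not in the original javadoc): callers never instantiate DAO.
 * The server calls DAO.init(config, dataPath) once at startup, and the servlets then use
 * the static helpers such as getConfig(), writeMessage() and the SearchLocalMessages
 * constructor directly.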
 *
 * To debug, call elasticsearch directly i.e.:
 *
 * get statistics
 * curl localhost:9200/_stats?pretty=true
 *
 * get statistics for message index
 * curl -XGET 'http://127.0.0.1:9200/messages?pretty=true'
 *
 * get mappings in message index
 * curl -XGET "http://localhost:9200/messages/_mapping?pretty=true"
 *
 * get search result from message index
 * curl -XGET 'http://127.0.0.1:9200/messages/_search?q=*&pretty=true'
 */
public class DAO {

    public final static com.fasterxml.jackson.core.JsonFactory jsonFactory = new com.fasterxml.jackson.core.JsonFactory();
    public final static com.fasterxml.jackson.databind.ObjectMapper jsonMapper = new com.fasterxml.jackson.databind.ObjectMapper(
            DAO.jsonFactory);
    public final static com.fasterxml.jackson.core.type.TypeReference<HashMap<String, Object>> jsonTypeRef = new com.fasterxml.jackson.core.type.TypeReference<HashMap<String, Object>>() {
    };

    public final static String MESSAGE_DUMP_FILE_PREFIX = "messages_";
    public final static String ACCOUNT_DUMP_FILE_PREFIX = "accounts_";
    public final static String USER_DUMP_FILE_PREFIX = "users_";
    public final static String ACCESS_DUMP_FILE_PREFIX = "access_";
    public final static String FOLLOWERS_DUMP_FILE_PREFIX = "followers_";
    public final static String FOLLOWING_DUMP_FILE_PREFIX = "following_";
    private static final String IMPORT_PROFILE_FILE_PREFIX = "profile_";

    public final static int CACHE_MAXSIZE = 10000;
    public final static int EXIST_MAXSIZE = 4000000;

    public static File conf_dir, bin_dir, html_dir;
    private static File external_data, assets, dictionaries;
    public static Settings public_settings, private_settings;
    private static Path message_dump_dir, account_dump_dir, import_profile_dump_dir;
    public static JsonRepository message_dump;
    private static JsonRepository account_dump;
    private static JsonRepository import_profile_dump;
    public static JsonDataset user_dump, followers_dump, following_dump;
    public static AccessTracker access;
    private static File schema_dir, conv_schema_dir;
    private static ElasticsearchClient elasticsearch_client;
    //private static Node elasticsearch_node;
    //private static Client elasticsearch_client;
    public static UserFactory users;
    private static AccountFactory accounts;
    public static MessageFactory messages;
    public static MessageFactory messages_hour;
    public static MessageFactory messages_day;
    public static MessageFactory messages_week;
    public static QueryFactory queries;
    private static ImportProfileFactory importProfiles;
    private static Map<String, String> config = new HashMap<>();
    public static GeoNames geoNames = null;
    public static Peers peers = new Peers();

    // AAA Schema for server usage
    public static JsonTray authentication;
    public static JsonTray authorization;
    public static JsonTray accounting;
    public static UserRoles userRoles;
    public static JsonTray passwordreset;
    public static Map<String, Accounting> accounting_temporary = new HashMap<>();
    public static JsonFile login_keys;

    // built-in artificial intelligence
    public static SusiMind susi;

    public static enum IndexName {
        messages_hour("messages.json"), messages_day("messages.json"), messages_week("messages.json"),
        messages, queries, users, accounts, import_profiles;

        private String schemaFileName;

        private IndexName() {
            schemaFileName = this.name() + ".json";
        }

        private IndexName(String filename) {
            schemaFileName = filename;
        }

        public String getSchemaFilename() {
            return this.schemaFileName;
        }
    }

    /**
     * initialize the DAO
     * @param configMap
     * @param dataPath the path to the data directory
     */
    public static void init(Map<String, String>
            configMap, Path dataPath) throws Exception {
        log("initializing loklak DAO");

        config = configMap;
        conf_dir = new File("conf");
        bin_dir = new File("bin");
        html_dir = new File("html");

        // wake up susi
        File susiinitpath = new File(conf_dir, "susi");
        File sudiwatchpath = new File(new File("data"), "susi");
        susi = new SusiMind(susiinitpath, sudiwatchpath);
        String susi_boilerplate_name = "susi_cognition_boilerplate.json";
        File susi_boilerplate_file = new File(sudiwatchpath, susi_boilerplate_name);
        if (!susi_boilerplate_file.exists())
            Files.copy(new File(conf_dir, "susi/" + susi_boilerplate_name + ".example"), susi_boilerplate_file);

        // initialize public and private keys
        public_settings = new Settings(new File("data/settings/public.settings.json"));
        File private_file = new File("data/settings/private.settings.json");
        private_settings = new Settings(private_file);
        OS.protectPath(private_file.toPath());

        if (!private_settings.loadPrivateKey() || !public_settings.loadPublicKey()) {
            log("Can't load key pair. Creating new one");

            // create new key pair
            KeyPairGenerator keyGen;
            try {
                String algorithm = "RSA";
                keyGen = KeyPairGenerator.getInstance(algorithm);
                keyGen.initialize(2048);
                KeyPair keyPair = keyGen.genKeyPair();
                private_settings.setPrivateKey(keyPair.getPrivate(), algorithm);
                public_settings.setPublicKey(keyPair.getPublic(), algorithm);
            } catch (NoSuchAlgorithmException e) {
                throw e;
            }
            log("Key creation finished. Peer hash: " + public_settings.getPeerHashAlgorithm() + " "
                    + public_settings.getPeerHash());
        } else {
            log("Key pair loaded from file. Peer hash: " + public_settings.getPeerHashAlgorithm() + " "
                    + public_settings.getPeerHash());
        }

        File datadir = dataPath.toFile();
        // check if elasticsearch shall be accessed as external cluster
        String transport = configMap.get("elasticsearch_transport.enabled");
        if (transport != null && "true".equals(transport)) {
            String cluster_name = configMap.get("elasticsearch_transport.cluster.name");
            String transport_addresses_string = configMap.get("elasticsearch_transport.addresses");
            if (transport_addresses_string != null && transport_addresses_string.length() > 0) {
                String[] transport_addresses = transport_addresses_string.split(",");
                elasticsearch_client = new ElasticsearchClient(transport_addresses, cluster_name);
            }
        } else {
            // use all config attributes with a key starting with "elasticsearch."
to set elasticsearch settings ESLoggerFactory.setDefaultFactory(new Slf4jESLoggerFactory()); org.elasticsearch.common.settings.Settings.Builder settings = org.elasticsearch.common.settings.Settings .builder(); for (Map.Entry<String, String> entry : config.entrySet()) { String key = entry.getKey(); if (key.startsWith("elasticsearch.")) settings.put(key.substring(14), entry.getValue()); } // patch the home path settings.put("path.home", datadir.getAbsolutePath()); settings.put("path.data", datadir.getAbsolutePath()); settings.build(); // start elasticsearch elasticsearch_client = new ElasticsearchClient(settings); } // open AAA storage Path settings_dir = dataPath.resolve("settings"); settings_dir.toFile().mkdirs(); Path authentication_path = settings_dir.resolve("authentication.json"); authentication = new JsonTray(authentication_path.toFile(), 10000); OS.protectPath(authentication_path); Path authorization_path = settings_dir.resolve("authorization.json"); authorization = new JsonTray(authorization_path.toFile(), 10000); OS.protectPath(authorization_path); Path passwordreset_path = settings_dir.resolve("passwordreset.json"); passwordreset = new JsonTray(passwordreset_path.toFile(), 10000); OS.protectPath(passwordreset_path); Path accounting_path = settings_dir.resolve("accounting.json"); accounting = new JsonTray(accounting_path.toFile(), 10000); OS.protectPath(accounting_path); Path login_keys_path = settings_dir.resolve("login-keys.json"); login_keys = new JsonFile(login_keys_path.toFile()); OS.protectPath(login_keys_path); Log.getLog().info("Initializing user roles"); Path userRoles_path = settings_dir.resolve("userRoles.json"); userRoles = new UserRoles(new JsonFile(userRoles_path.toFile())); OS.protectPath(userRoles_path); try { userRoles.loadUserRolesFromObject(); Log.getLog().info("Loaded user roles from file"); } catch (IllegalArgumentException e) { Log.getLog().info("Load default user roles"); userRoles.loadDefaultUserRoles(); } // open index Path index_dir = dataPath.resolve("index"); if (index_dir.toFile().exists()) OS.protectPath(index_dir); // no other permissions to this path // define the index factories messages = new MessageFactory(elasticsearch_client, IndexName.messages.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); messages_hour = new MessageFactory(elasticsearch_client, IndexName.messages_hour.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); messages_day = new MessageFactory(elasticsearch_client, IndexName.messages_day.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); messages_week = new MessageFactory(elasticsearch_client, IndexName.messages_week.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); users = new UserFactory(elasticsearch_client, IndexName.users.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); accounts = new AccountFactory(elasticsearch_client, IndexName.accounts.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); queries = new QueryFactory(elasticsearch_client, IndexName.queries.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); importProfiles = new ImportProfileFactory(elasticsearch_client, IndexName.import_profiles.name(), CACHE_MAXSIZE, EXIST_MAXSIZE); // create indices and set mapping (that shows how 'elastic' elasticsearch is: it's always good to define data types) File mappingsDir = new File(new File(conf_dir, "elasticsearch"), "mappings"); int shards = Integer.parseInt(configMap.get("elasticsearch.index.number_of_shards")); int replicas = Integer.parseInt(configMap.get("elasticsearch.index.number_of_replicas")); for (IndexName index : IndexName.values()) { log("initializing index '" + index.name() + "'..."); try { 
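                // Added note (not from the original source): this loop creates each index and loads
                // its mapping from conf/elasticsearch/mappings/ using IndexName.getSchemaFilename().
                // Once it has run, the result can be checked with the curl calls from the class
                // javadoc at the top of this file, e.g.
                //   curl -XGET "http://localhost:9200/messages/_mapping?pretty=true"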
elasticsearch_client.createIndexIfNotExists(index.name(), shards, replicas); } catch (Throwable e) { Log.getLog().warn(e); } try { elasticsearch_client.setMapping(index.name(), new File(mappingsDir, index.getSchemaFilename())); } catch (Throwable e) { Log.getLog().warn(e); } } // elasticsearch will probably take some time until it is started up. We do some other stuff meanwhile.. // create and document the data dump dir assets = new File(datadir, "assets"); external_data = new File(datadir, "external"); dictionaries = new File(external_data, "dictionaries"); dictionaries.mkdirs(); // create message dump dir String message_dump_readme = "This directory contains dump files for messages which arrived the platform.\n" + "There are three subdirectories for dump files:\n" + "- own: for messages received with this peer. There is one file for each month.\n" + "- import: hand-over directory for message dumps to be imported. Drop dumps here and they are imported.\n" + "- imported: dump files which had been processed from the import directory are moved here.\n" + "You can import dump files from other peers by dropping them into the import directory.\n" + "Each dump file must start with the prefix '" + MESSAGE_DUMP_FILE_PREFIX + "' to be recognized.\n"; message_dump_dir = dataPath.resolve("dump"); message_dump = new JsonRepository(message_dump_dir.toFile(), MESSAGE_DUMP_FILE_PREFIX, message_dump_readme, JsonRepository.COMPRESSED_MODE, true, Runtime.getRuntime().availableProcessors()); account_dump_dir = dataPath.resolve("accounts"); account_dump_dir.toFile().mkdirs(); OS.protectPath(account_dump_dir); // no other permissions to this path account_dump = new JsonRepository(account_dump_dir.toFile(), ACCOUNT_DUMP_FILE_PREFIX, null, JsonRepository.REWRITABLE_MODE, false, Runtime.getRuntime().availableProcessors()); File user_dump_dir = new File(datadir, "accounts"); user_dump_dir.mkdirs(); user_dump = new JsonDataset(user_dump_dir, USER_DUMP_FILE_PREFIX, new JsonDataset.Column[] { new JsonDataset.Column("id_str", false), new JsonDataset.Column("screen_name", true) }, "retrieval_date", DateParser.PATTERN_ISO8601MILLIS, JsonRepository.REWRITABLE_MODE, false, Integer.MAX_VALUE); followers_dump = new JsonDataset(user_dump_dir, FOLLOWERS_DUMP_FILE_PREFIX, new JsonDataset.Column[] { new JsonDataset.Column("screen_name", true) }, "retrieval_date", DateParser.PATTERN_ISO8601MILLIS, JsonRepository.REWRITABLE_MODE, false, Integer.MAX_VALUE); following_dump = new JsonDataset(user_dump_dir, FOLLOWING_DUMP_FILE_PREFIX, new JsonDataset.Column[] { new JsonDataset.Column("screen_name", true) }, "retrieval_date", DateParser.PATTERN_ISO8601MILLIS, JsonRepository.REWRITABLE_MODE, false, Integer.MAX_VALUE); Path log_dump_dir = dataPath.resolve("log"); log_dump_dir.toFile().mkdirs(); OS.protectPath(log_dump_dir); // no other permissions to this path access = new AccessTracker(log_dump_dir.toFile(), ACCESS_DUMP_FILE_PREFIX, 60000, 3000); access.start(); // start monitor import_profile_dump_dir = dataPath.resolve("import-profiles"); import_profile_dump = new JsonRepository(import_profile_dump_dir.toFile(), IMPORT_PROFILE_FILE_PREFIX, null, JsonRepository.COMPRESSED_MODE, false, Runtime.getRuntime().availableProcessors()); // load schema folder conv_schema_dir = new File("conf/conversion"); schema_dir = new File("conf/schema"); // load dictionaries if they are embedded here // read the file allCountries.zip from http://download.geonames.org/export/dump/allCountries.zip //File allCountries = new File(dictionaries, 
"allCountries.zip"); File cities1000 = new File(dictionaries, "cities1000.zip"); if (!cities1000.exists()) { // download this file ClientConnection.download("http://download.geonames.org/export/dump/cities1000.zip", cities1000); } if (cities1000.exists()) { try { geoNames = new GeoNames(cities1000, new File(conf_dir, "iso3166.json"), 1); } catch (IOException e) { Log.getLog().warn(e.getMessage()); cities1000.delete(); geoNames = null; } } // finally wait for healthy status of elasticsearch shards ClusterHealthStatus required_status = ClusterHealthStatus .fromString(config.get("elasticsearch_requiredClusterHealthStatus")); boolean ok; do { log("Waiting for elasticsearch " + required_status.name() + " status"); ok = elasticsearch_client.wait_ready(60000l, required_status); } while (!ok); /** do { log("Waiting for elasticsearch green status"); health = elasticsearch_client.admin().cluster().prepareHealth().setWaitForGreenStatus().execute().actionGet(); } while (health.isTimedOut()); **/ log("elasticsearch has started up!"); // start the classifier new Thread() { public void run() { log("initializing the classifier..."); try { Classifier.init(10000, 1000); } catch (Throwable e) { Log.getLog().warn(e); } log("classifier initialized!"); } }.start(); log("initializing queries..."); File harvestingPath = new File(datadir, "queries"); if (!harvestingPath.exists()) harvestingPath.mkdirs(); String[] list = harvestingPath.list(); for (String queryfile : list) { if (queryfile.startsWith(".") || queryfile.endsWith("~")) continue; try { BufferedReader reader = new BufferedReader( new InputStreamReader(new FileInputStream(new File(harvestingPath, queryfile)))); String line; List<IndexEntry<QueryEntry>> bulkEntries = new ArrayList<>(); while ((line = reader.readLine()) != null) { line = line.trim().toLowerCase(); if (line.length() == 0) continue; if (line.charAt(0) <= '9') { // truncate statistic int p = line.indexOf(' '); if (p < 0) continue; line = line.substring(p + 1).trim(); } // write line into query database if (!existQuery(line)) { bulkEntries.add(new IndexEntry<QueryEntry>(line, SourceType.TWITTER, new QueryEntry(line, 0, 60000, SourceType.TWITTER, false))); } if (bulkEntries.size() > 1000) { queries.writeEntries(bulkEntries); bulkEntries.clear(); } } queries.writeEntries(bulkEntries); reader.close(); } catch (IOException e) { Log.getLog().warn(e); } } log("queries initialized."); log("finished DAO initialization"); } public static boolean wait_ready(long maxtimemillis) { ClusterHealthStatus required_status = ClusterHealthStatus .fromString(config.get("elasticsearch_requiredClusterHealthStatus")); return elasticsearch_client.wait_ready(maxtimemillis, required_status); } public static String pendingClusterTasks() { return elasticsearch_client.pendingClusterTasks(); } public static String clusterStats() { return elasticsearch_client.clusterStats(); } public static Map<String, String> nodeSettings() { return elasticsearch_client.nodeSettings(); } public static File getAssetFile(String screen_name, String id_str, String file) { String letter0 = ("" + screen_name.charAt(0)).toLowerCase(); String letter1 = ("" + screen_name.charAt(1)).toLowerCase(); File storage_path = new File(new File(new File(assets, letter0), letter1), screen_name); return new File(storage_path, id_str + "_" + file); // all assets for one user in one file } public static Collection<File> getTweetOwnDumps(int count) { return message_dump.getOwnDumps(count); } public static void importAccountDumps(int count) throws IOException { 
Collection<File> dumps = account_dump.getImportDumps(count); if (dumps == null || dumps.size() == 0) return; for (File dump : dumps) { JsonReader reader = account_dump.getDumpReader(dump); final JsonReader dumpReader = reader; Thread[] indexerThreads = new Thread[dumpReader.getConcurrency()]; for (int i = 0; i < dumpReader.getConcurrency(); i++) { indexerThreads[i] = new Thread() { public void run() { JsonFactory accountEntry; try { while ((accountEntry = dumpReader.take()) != JsonStreamReader.POISON_JSON_MAP) { try { JSONObject json = accountEntry.getJSON(); AccountEntry a = new AccountEntry(json); DAO.writeAccount(a, false); } catch (IOException e) { Log.getLog().warn(e); } } } catch (InterruptedException e) { Log.getLog().warn(e); } } }; indexerThreads[i].start(); } for (int i = 0; i < dumpReader.getConcurrency(); i++) { try { indexerThreads[i].join(); } catch (InterruptedException e) { } } account_dump.shiftProcessedDump(dump.getName()); } } /** * close all objects in this class */ public static void close() { Log.getLog().info("closing DAO"); // close the dump files message_dump.close(); account_dump.close(); import_profile_dump.close(); user_dump.close(); followers_dump.close(); following_dump.close(); // close the tracker access.close(); // close the index factories (flushes the caches) messages.close(); messages_hour.close(); messages_day.close(); messages_week.close(); users.close(); accounts.close(); queries.close(); importProfiles.close(); // close the index elasticsearch_client.close(); Log.getLog().info("closed DAO"); } /** * get values from * @param key * @param default_val * @return */ public static String getConfig(String key, String default_val) { String value = config.get(key); return value == null ? default_val : value; } public static String[] getConfig(String key, String[] default_val, String delim) { String value = config.get(key); return value == null || value.length() == 0 ? default_val : value.split(delim); } public static long getConfig(String key, long default_val) { String value = config.get(key); try { return value == null ? default_val : Long.parseLong(value); } catch (NumberFormatException e) { return default_val; } } public static double getConfig(String key, double default_val) { String value = config.get(key); try { return value == null ? default_val : Double.parseDouble(value); } catch (NumberFormatException e) { return default_val; } } public static JsonNode getSchema(String key) throws IOException { File schema = new File(schema_dir, key); if (!schema.exists()) { throw new FileNotFoundException("No schema file with name " + key + " found"); } return JsonLoader.fromFile(schema); } public static JSONObject getConversionSchema(String key) throws IOException { File schema = new File(conv_schema_dir, key); if (!schema.exists()) { throw new FileNotFoundException("No schema file with name " + key + " found"); } return new JSONObject(com.google.common.io.Files.toString(schema, Charsets.UTF_8)); } public static boolean getConfig(String key, boolean default_val) { String value = config.get(key); return value == null ? 
default_val : value.equals("true") || value.equals("on") || value.equals("1"); } public static Set<String> getConfigKeys() { return config.keySet(); } public static class MessageWrapper { public MessageEntry t; public UserEntry u; public boolean dump; public MessageWrapper(MessageEntry t, UserEntry u, boolean dump) { this.t = t; this.u = u; this.dump = dump; } } /** * Store a message together with a user into the search index * @param mw a message wrapper * @return true if the record was stored because it did not exist, false if it was not stored because the record existed already */ public static boolean writeMessage(MessageWrapper mw) { if (mw.t == null) return false; try { synchronized (DAO.class) { // record tweet into search index and check if this is a new entry // and check if the message exists boolean exists = false; if (mw.t.getCreatedAt().after(DateParser.oneHourAgo())) { exists = messages_hour .writeEntry(new IndexEntry<MessageEntry>(mw.t.getIdStr(), mw.t.getSourceType(), mw.t)); if (exists) return false; } if (mw.t.getCreatedAt().after(DateParser.oneDayAgo())) { exists = messages_day .writeEntry(new IndexEntry<MessageEntry>(mw.t.getIdStr(), mw.t.getSourceType(), mw.t)); if (exists) return false; } if (mw.t.getCreatedAt().after(DateParser.oneWeekAgo())) { exists = messages_week .writeEntry(new IndexEntry<MessageEntry>(mw.t.getIdStr(), mw.t.getSourceType(), mw.t)); if (exists) return false; } exists = messages .writeEntry(new IndexEntry<MessageEntry>(mw.t.getIdStr(), mw.t.getSourceType(), mw.t)); if (exists) return false; // write the user into the index users.writeEntry(new IndexEntry<UserEntry>(mw.u.getScreenName(), mw.t.getSourceType(), mw.u)); // record tweet into text file if (mw.dump) message_dump.write(mw.t.toJSON(mw.u, false, Integer.MAX_VALUE, "")); } // teach the classifier Classifier.learnPhrase(mw.t.getText(Integer.MAX_VALUE, "")); } catch (IOException e) { Log.getLog().warn(e); } return true; } public static Set<String> writeMessageBulk(Collection<MessageWrapper> mws) { List<MessageWrapper> noDump = new ArrayList<>(); List<MessageWrapper> dump = new ArrayList<>(); for (MessageWrapper mw : mws) { if (mw.t == null) continue; if (mw.dump) dump.add(mw); else noDump.add(mw); } Set<String> createdIDs = new HashSet<>(); createdIDs.addAll(writeMessageBulkNoDump(noDump)); createdIDs.addAll(writeMessageBulkDump(dump)); // does also do an writeMessageBulkNoDump internally return createdIDs; } /** * write messages without writing them to the dump file * @param mws a collection of message wrappers * @return a set of message IDs which had been created with this bulk write. 
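 * Note (added): like writeMessage() above, the bulk path writes each message to every retention
 * tier whose time window covers its creation date (messages_hour, messages_day, messages_week and
 * finally messages), and IDs reported as already existing by an earlier, smaller tier are filtered
 * out of the later, larger bulk writes.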
*/ private static Set<String> writeMessageBulkNoDump(Collection<MessageWrapper> mws) { if (mws.size() == 0) return new HashSet<>(); List<IndexEntry<UserEntry>> userBulk = new ArrayList<>(); List<IndexEntry<MessageEntry>> messageBulk = new ArrayList<>(); for (MessageWrapper mw : mws) { if (messages.existsCache(mw.t.getIdStr())) continue; // we omit writing this again synchronized (DAO.class) { // write the user into the index userBulk.add(new IndexEntry<UserEntry>(mw.u.getScreenName(), mw.t.getSourceType(), mw.u)); // record tweet into search index messageBulk.add(new IndexEntry<MessageEntry>(mw.t.getIdStr(), mw.t.getSourceType(), mw.t)); } // teach the classifier Classifier.learnPhrase(mw.t.getText(Integer.MAX_VALUE, "")); } ElasticsearchClient.BulkWriteResult result = null; try { final Date limitDate = new Date(); List<IndexEntry<MessageEntry>> macc; final Set<String> existed = new HashSet<>(); //DAO.log("***DEBUG messages INIT: " + messageBulk.size()); limitDate.setTime(DateParser.oneHourAgo().getTime()); macc = messageBulk.stream().filter(i -> i.getObject().getCreatedAt().after(limitDate)) .collect(Collectors.toList()); //DAO.log("***DEBUG messages for HOUR: " + macc.size()); result = messages_hour.writeEntries(macc); //DAO.log("***DEBUG messages for HOUR: " + result.getCreated().size() + " created"); for (IndexEntry<MessageEntry> i : macc) if (!(result.getCreated().contains(i.getId()))) existed.add(i.getId()); //DAO.log("***DEBUG messages for HOUR: " + existed.size() + " existed"); limitDate.setTime(DateParser.oneDayAgo().getTime()); macc = messageBulk.stream().filter(i -> !(existed.contains(i.getObject().getIdStr()))) .filter(i -> i.getObject().getCreatedAt().after(limitDate)).collect(Collectors.toList()); //DAO.log("***DEBUG messages for DAY : " + macc.size()); result = messages_day.writeEntries(macc); //DAO.log("***DEBUG messages for DAY: " + result.getCreated().size() + " created"); for (IndexEntry<MessageEntry> i : macc) if (!(result.getCreated().contains(i.getId()))) existed.add(i.getId()); //DAO.log("***DEBUG messages for DAY: " + existed.size() + " existed"); limitDate.setTime(DateParser.oneWeekAgo().getTime()); macc = messageBulk.stream().filter(i -> !(existed.contains(i.getObject().getIdStr()))) .filter(i -> i.getObject().getCreatedAt().after(limitDate)).collect(Collectors.toList()); //DAO.log("***DEBUG messages for WEEK: " + macc.size()); result = messages_week.writeEntries(macc); //DAO.log("***DEBUG messages for WEEK: " + result.getCreated().size() + " created"); for (IndexEntry<MessageEntry> i : macc) if (!(result.getCreated().contains(i.getId()))) existed.add(i.getId()); //DAO.log("***DEBUG messages for WEEK: " + existed.size() + " existed"); macc = messageBulk.stream().filter(i -> !(existed.contains(i.getObject().getIdStr()))) .collect(Collectors.toList()); //DAO.log("***DEBUG messages for ALL : " + macc.size()); result = messages.writeEntries(macc); //DAO.log("***DEBUG messages for ALL: " + result.getCreated().size() + " created"); for (IndexEntry<MessageEntry> i : macc) if (!(result.getCreated().contains(i.getId()))) existed.add(i.getId()); //DAO.log("***DEBUG messages for ALL: " + existed.size() + " existed"); users.writeEntries(userBulk); } catch (IOException e) { Log.getLog().warn(e); } if (result == null) return new HashSet<String>(); return result.getCreated(); } private static Set<String> writeMessageBulkDump(Collection<MessageWrapper> mws) { Set<String> created = writeMessageBulkNoDump(mws); for (MessageWrapper mw : mws) try { if 
(!created.contains(mw.t.getIdStr())) continue; synchronized (DAO.class) { // record tweet into text file message_dump.write(mw.t.toJSON(mw.u, false, Integer.MAX_VALUE, "")); } // teach the classifier Classifier.learnPhrase(mw.t.getText(Integer.MAX_VALUE, "")); } catch (IOException e) { Log.getLog().warn(e); } return created; } /** * Store an account together with a user into the search index * This method is synchronized to prevent concurrent IO caused by this call. * @param a an account * @param dump * @return true if the record was stored because it did not exist, false if it was not stored because the record existed already */ public static boolean writeAccount(AccountEntry a, boolean dump) { try { // record account into text file if (dump) account_dump.write(a.toJSON(null)); // record account into search index accounts.writeEntry(new IndexEntry<AccountEntry>(a.getScreenName(), a.getSourceType(), a)); } catch (IOException e) { Log.getLog().warn(e); } return true; } /** * Store an import profile into the search index * This method is synchronized to prevent concurrent IO caused by this call. * @param i an import profile * @return true if the record was stored because it did not exist, false if it was not stored because the record existed already */ public static boolean writeImportProfile(ImportProfileEntry i, boolean dump) { try { // record import profile into text file if (dump) import_profile_dump.write(i.toJSON()); // record import profile into search index importProfiles.writeEntry(new IndexEntry<ImportProfileEntry>(i.getId(), i.getSourceType(), i)); } catch (IOException e) { Log.getLog().warn(e); } return true; } private static long countLocalHourMessages(final long millis) { if (millis > 3600000L) return countLocalDayMessages(millis); return elasticsearch_client.count(IndexName.messages_hour.name(), "timestamp", millis == 3600000L ? -1 : millis); } private static long countLocalDayMessages(final long millis) { if (millis > 86400000L) return countLocalWeekMessages(millis); return elasticsearch_client.count(IndexName.messages_day.name(), "timestamp", millis == 3600000L ? -1 : millis); } private static long countLocalWeekMessages(final long millis) { if (millis > 604800000L) return countLocalMessages(millis); return elasticsearch_client.count(IndexName.messages_week.name(), "timestamp", millis == 3600000L ? -1 : millis); } public static long countLocalMessages(final long millis) { if (millis == 0) return 0; if (millis > 0) { if (millis <= 3600000L) return countLocalHourMessages(millis); if (millis <= 86400000L) return countLocalDayMessages(millis); if (millis <= 604800000L) return countLocalWeekMessages(millis); } return elasticsearch_client.count(IndexName.messages.name(), "timestamp", millis == Long.MAX_VALUE ? 
-1 : millis); } public static long countLocalMessages() { return elasticsearch_client.count(IndexName.messages.name(), "timestamp", -1); } public static long countLocalMessages(String provider_hash) { return elasticsearch_client.countLocal(IndexName.messages.name(), provider_hash); } public static long countLocalUsers() { return elasticsearch_client.count(IndexName.users.name(), "timestamp", -1); } public static long countLocalQueries() { return elasticsearch_client.count(IndexName.queries.name(), "timestamp", -1); } public static long countLocalAccounts() { return elasticsearch_client.count(IndexName.accounts.name(), "timestamp", -1); } public static MessageEntry readMessage(String id) throws IOException { MessageEntry m = null; return messages_hour != null && ((m = messages_hour.read(id)) != null) ? m : messages_day != null && ((m = messages_day.read(id)) != null) ? m : messages_week != null && ((m = messages_week.read(id)) != null) ? m : messages.read(id); } public static boolean existMessage(String id) { return messages_hour != null && messages_hour.exists(id) || messages_day != null && messages_day.exists(id) || messages_week != null && messages_week.exists(id) || messages != null && messages.exists(id); } public static boolean existUser(String id) { return users.exists(id); } public static boolean existQuery(String id) { return queries.exists(id); } public static boolean deleteQuery(String id, SourceType sourceType) { return queries.delete(id, sourceType); } public static boolean deleteImportProfile(String id, SourceType sourceType) { return importProfiles.delete(id, sourceType); } public static int deleteOld(IndexName indexName, Date createDateLimit) { RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery("created_at").to(createDateLimit); return elasticsearch_client.deleteByQuery(indexName.name(), rangeQuery); } public static class SearchLocalMessages { public Timeline timeline; public Map<String, List<Map.Entry<String, Long>>> aggregations; public ElasticsearchClient.Query query; /** * Search the local message cache using a elasticsearch query. * @param q - the query, for aggregation this which should include a time frame in the form since:yyyy-MM-dd until:yyyy-MM-dd * @param order_field - the field to order the results, i.e. Timeline.Order.CREATED_AT * @param timezoneOffset - an offset in minutes that is applied on dates given in the query of the form since:date until:date * @param resultCount - the number of messages in the result; can be zero if only aggregations are wanted * @param aggregationLimit - the maximum count of facet entities, not search results * @param aggregationFields - names of the aggregation fields. If no aggregation is wanted, pass no (zero) field(s) */ public SearchLocalMessages(final String q, Timeline.Order order_field, int timezoneOffset, int resultCount, int aggregationLimit, String... 
aggregationFields) { this.timeline = new Timeline(order_field); QueryEntry.ElasticsearchQuery sq = new QueryEntry.ElasticsearchQuery(q, timezoneOffset); long interval = sq.until.getTime() - sq.since.getTime(); IndexName resultIndex; boolean wholetime = aggregationFields.length > 0; if (wholetime) { if (q.contains("since:hour")) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages_hour).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); } else if (q.contains("since:day")) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages_day).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); } else if (q.contains("since:week")) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages_week).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); } else { this.query = elasticsearch_client.query((resultIndex = IndexName.messages).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); } } else { // use only a time frame that is sufficient for a result this.query = elasticsearch_client.query((resultIndex = IndexName.messages_hour).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); if (!q.contains("since:hour") && insufficient(this.query, resultCount, aggregationLimit, aggregationFields)) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages_day).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); if (!q.contains("since:day") && insufficient(this.query, resultCount, aggregationLimit, aggregationFields)) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages_week).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); if (!q.contains("since:week") && insufficient(this.query, resultCount, aggregationLimit, aggregationFields)) { this.query = elasticsearch_client.query((resultIndex = IndexName.messages).name(), sq.queryBuilder, order_field.getMessageFieldName(), timezoneOffset, resultCount, interval, "created_at", aggregationLimit, aggregationFields); } } } } timeline.setHits(query.hitCount); timeline.setResultIndex(resultIndex); // evaluate search result for (Map<String, Object> map : query.result) { MessageEntry tweet = new MessageEntry(new JSONObject(map)); try { UserEntry user = users.read(tweet.getScreenName()); assert user != null; if (user != null) { timeline.add(tweet, user); } } catch (IOException e) { Log.getLog().warn(e); } } this.aggregations = query.aggregations; } private static boolean insufficient(ElasticsearchClient.Query query, int resultCount, int aggregationLimit, String... 
aggregationFields) { return query.hitCount < resultCount || (aggregationFields.length > 0 && getAggregationResultLimit(query.aggregations) < aggregationLimit); } public JSONObject getAggregations() { JSONObject json = new JSONObject(true); if (aggregations == null) return json; for (Map.Entry<String, List<Map.Entry<String, Long>>> aggregation : aggregations.entrySet()) { JSONObject facet = new JSONObject(true); for (Map.Entry<String, Long> a : aggregation.getValue()) { if (a.getValue().equals(query)) continue; // we omit obvious terms that cannot be used for faceting, like search for "#abc" -> most hashtag is "#abc" facet.put(a.getKey(), a.getValue()); } json.put(aggregation.getKey(), facet); } return json; } private static int getAggregationResultLimit(Map<String, List<Map.Entry<String, Long>>> agg) { if (agg == null) return 0; int l = 0; for (List<Map.Entry<String, Long>> a : agg.values()) l = Math.max(l, a.size()); return l; } } public static LinkedHashMap<String, Long> FullDateHistogram(int timezoneOffset) { return elasticsearch_client.fullDateHistogram(IndexName.messages.name(), timezoneOffset, "created_at"); } /** * Search the local user cache using a elasticsearch query. * @param screen_name - the user id */ public static UserEntry searchLocalUserByScreenName(final String screen_name) { try { return users.read(screen_name); } catch (IOException e) { Log.getLog().warn(e); return null; } } public static UserEntry searchLocalUserByUserId(final String user_id) { if (user_id == null || user_id.length() == 0) return null; Map<String, Object> map = elasticsearch_client.query(IndexName.users.name(), UserEntry.field_user_id, user_id); if (map == null) return null; return new UserEntry(new JSONObject(map)); } /** * Search the local account cache using an elasticsearch query. * @param screen_name - the user id */ public static AccountEntry searchLocalAccount(final String screen_name) { try { return accounts.read(screen_name); } catch (IOException e) { Log.getLog().warn(e); return null; } } /** * Search the local message cache using a elasticsearch query. * @param q - the query, can be empty for a matchall-query * @param resultCount - the number of messages in the result * @param sort_field - the field name to sort the result list, i.e. 
"query_first" * @param sort_order - the sort order (you want to use SortOrder.DESC here) */ public static ResultList<QueryEntry> SearchLocalQueries(final String q, final int resultCount, final String sort_field, final String default_sort_type, final SortOrder sort_order, final Date since, final Date until, final String range_field) { ResultList<QueryEntry> queries = new ResultList<>(); ResultList<Map<String, Object>> result = elasticsearch_client.fuzzyquery(IndexName.queries.name(), "query", q, resultCount, sort_field, default_sort_type, sort_order, since, until, range_field); queries.setHits(result.getHits()); for (Map<String, Object> map : result) { queries.add(new QueryEntry(new JSONObject(map))); } return queries; } public static ImportProfileEntry SearchLocalImportProfiles(final String id) { try { return importProfiles.read(id); } catch (IOException e) { Log.getLog().warn(e); return null; } } public static Collection<ImportProfileEntry> SearchLocalImportProfilesWithConstraints( final Map<String, String> constraints, boolean latest) throws IOException { List<ImportProfileEntry> rawResults = new ArrayList<>(); List<Map<String, Object>> result = elasticsearch_client.queryWithConstraints( IndexName.import_profiles.name(), "active_status", ImportProfileEntry.EntryStatus.ACTIVE.name().toLowerCase(), constraints, latest); for (Map<String, Object> map : result) { rawResults.add(new ImportProfileEntry(new JSONObject(map))); } if (!latest) { return rawResults; } // filter results to display only latest profiles Map<String, ImportProfileEntry> latests = new HashMap<>(); for (ImportProfileEntry entry : rawResults) { String uniqueKey; if (entry.getImporter() != null) { uniqueKey = entry.getSourceUrl() + entry.getImporter(); } else { uniqueKey = entry.getSourceUrl() + entry.getClientHost(); } if (latests.containsKey(uniqueKey)) { if (entry.getLastModified().compareTo(latests.get(uniqueKey).getLastModified()) > 0) { latests.put(uniqueKey, entry); } } else { latests.put(uniqueKey, entry); } } return latests.values(); } public static Timeline scrapeTwitter(final Query post, final String q, final Timeline.Order order, final int timezoneOffset, boolean byUserQuery, long timeout, boolean recordQuery) { // retrieve messages from remote server ArrayList<String> remote = DAO.getFrontPeers(); Timeline tl; if (remote.size() > 0 && (peerLatency.get(remote.get(0)) == null || peerLatency.get(remote.get(0)).longValue() < 3000)) { long start = System.currentTimeMillis(); tl = searchOnOtherPeers(remote, q, order, 100, timezoneOffset, "all", SearchServlet.frontpeer_hash, timeout); // all must be selected here to catch up missing tweets between intervals // at this point the remote list can be empty as a side-effect of the remote search attempt if (post != null && remote.size() > 0 && tl != null) post.recordEvent("remote_scraper_on_" + remote.get(0), System.currentTimeMillis() - start); if (tl == null || tl.size() == 0) { // maybe the remote server died, we try then ourself start = System.currentTimeMillis(); tl = TwitterScraper.search(q, order, true, true, 400); if (post != null) post.recordEvent("local_scraper_after_unsuccessful_remote", System.currentTimeMillis() - start); } } else { if (post != null && remote.size() > 0) post.recordEvent("omitted_scraper_latency_" + remote.get(0), peerLatency.get(remote.get(0))); long start = System.currentTimeMillis(); tl = TwitterScraper.search(q, order, true, true, 400); if (post != null) post.recordEvent("local_scraper", System.currentTimeMillis() - start); } // record the 
query long start2 = System.currentTimeMillis(); QueryEntry qe = null; try { qe = queries.read(q); } catch (IOException | JSONException e) { Log.getLog().warn(e); } if (recordQuery && Caretaker.acceptQuery4Retrieval(q)) { if (qe == null) { // a new query occurred qe = new QueryEntry(q, timezoneOffset, tl.period(), SourceType.TWITTER, byUserQuery); } else { // existing queries are updated qe.update(tl.period(), byUserQuery); } try { queries.writeEntry(new IndexEntry<QueryEntry>(q, qe.source_type == null ? SourceType.TWITTER : qe.source_type, qe)); } catch (IOException e) { Log.getLog().warn(e); } } else { // accept rules may change, we want to delete the query then in the index if (qe != null) queries.delete(q, qe.source_type); } if (post != null) post.recordEvent("query_recorder", System.currentTimeMillis() - start2); //log("SCRAPER: TIME LEFT after recording = " + (termination - System.currentTimeMillis())); return tl; } public static final Random random = new Random(System.currentTimeMillis()); private static final Map<String, Long> peerLatency = new HashMap<>(); private static ArrayList<String> getBestPeers(Collection<String> peers) { ArrayList<String> best = new ArrayList<>(); if (peers == null || peers.size() == 0) return best; // first check if any of the given peers has unknown latency TreeMap<Long, String> o = new TreeMap<>(); for (String peer : peers) { if (peerLatency.containsKey(peer)) { o.put(peerLatency.get(peer) * 1000 + best.size(), peer); } else { best.add(peer); } } best.addAll(o.values()); return best; } public static void healLatency(float factor) { for (Map.Entry<String, Long> entry : peerLatency.entrySet()) { entry.setValue((long) (factor * entry.getValue())); } } private static Set<String> frontPeerCache = new HashSet<String>(); private static Set<String> backendPeerCache = new HashSet<String>(); public static void updateFrontPeerCache(RemoteAccess remoteAccess) { if (remoteAccess.getLocalHTTPPort() >= 80) { frontPeerCache.add("http://" + remoteAccess.getRemoteHost() + (remoteAccess.getLocalHTTPPort() == 80 ? "" : ":" + remoteAccess.getLocalHTTPPort())); } else if (remoteAccess.getLocalHTTPSPort() >= 443) { frontPeerCache.add("https://" + remoteAccess.getRemoteHost() + (remoteAccess.getLocalHTTPSPort() == 443 ? "" : ":" + remoteAccess.getLocalHTTPSPort())); } } /** * from all known front peers, generate a list of available peers, ordered by the peer latency * @return a list of front peers. 
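 * (ordering note, added: peers whose latency is still unknown are listed first, followed by the
 * known peers sorted by their measured latency, as computed in getBestPeers() above, so an
 * untested peer is always tried before a known slow one;)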
only the first one shall be used, but the other are fail-over peers */ public static ArrayList<String> getFrontPeers() { String[] remote = DAO.getConfig("frontpeers", new String[0], ","); ArrayList<String> testpeers = new ArrayList<>(); if (remote.length > 0) { for (String peer : remote) testpeers.add(peer); return testpeers; } if (frontPeerCache.size() == 0) { // add dynamically all peers that contacted myself for (Map<String, RemoteAccess> hmap : RemoteAccess.history.values()) { for (Map.Entry<String, RemoteAccess> peer : hmap.entrySet()) { updateFrontPeerCache(peer.getValue()); } } } testpeers.addAll(frontPeerCache); return getBestPeers(testpeers); } public static List<String> getBackendPeers() { List<String> testpeers = new ArrayList<>(); if (backendPeerCache.size() == 0) { String[] remote = DAO.getConfig("backend", new String[0], ","); for (String peer : remote) backendPeerCache.add(peer); } testpeers.addAll(backendPeerCache); return getBestPeers(testpeers); } public static Timeline searchBackend(final String q, final Timeline.Order order, final int count, final int timezoneOffset, final String where, final long timeout) { List<String> remote = getBackendPeers(); if (remote .size() > 0 /*&& (peerLatency.get(remote.get(0)) == null || peerLatency.get(remote.get(0)) < 3000)*/) { // condition deactivated because we need always at least one peer Timeline tt = searchOnOtherPeers(remote, q, order, count, timezoneOffset, where, SearchServlet.backend_hash, timeout); if (tt != null) tt.writeToIndex(); return tt; } return null; } private final static Random randomPicker = new Random(System.currentTimeMillis()); public static Timeline searchOnOtherPeers(final List<String> remote, final String q, final Timeline.Order order, final int count, final int timezoneOffset, final String source, final String provider_hash, final long timeout) { // select remote peer while (remote.size() > 0) { int pick = randomPicker.nextInt(remote.size()); String peer = remote.get(pick); long start = System.currentTimeMillis(); try { Timeline tl = SearchServlet.search(peer, q, order, source, count, timezoneOffset, provider_hash, timeout); peerLatency.put(peer, System.currentTimeMillis() - start); // to show which peer was used for the retrieval, we move the picked peer to the front of the list if (pick != 0) remote.add(0, remote.remove(pick)); tl.setScraperInfo(tl.getScraperInfo().length() > 0 ? peer + "," + tl.getScraperInfo() : peer); return tl; } catch (IOException e) { DAO.log("searchOnOtherPeers: no IO to scraping target: " + e.getMessage()); // the remote peer seems to be unresponsive, remove it (temporary) from the remote peer list peerLatency.put(peer, 3600000L); frontPeerCache.remove(peer); backendPeerCache.remove(peer); remote.remove(pick); } } return null; } public final static Set<Number> newUserIds = new ConcurrentHashSet<>(); public static void announceNewUserId(Timeline tl) { for (MessageEntry message : tl) { UserEntry user = tl.getUser(message); assert user != null; if (user == null) continue; Number id = user.getUser(); if (id != null) announceNewUserId(id); } } public static void announceNewUserId(Number id) { JsonFactory mapcapsule = DAO.user_dump.get("id_str", id.toString()); JSONObject map = null; try { map = mapcapsule == null ? 
null : mapcapsule.getJSON(); } catch (IOException e) { } if (map == null) newUserIds.add(id); } public static Set<Number> getNewUserIdsChunk() { if (newUserIds.size() < 100) return null; Set<Number> chunk = new HashSet<>(); Iterator<Number> i = newUserIds.iterator(); for (int j = 0; j < 100; j++) { chunk.add(i.next()); i.remove(); } return chunk; } public static void log(String line) { Log.getLog().info(line); } public static void severe(String line) { Log.getLog().warn(line); } public static void severe(String line, Throwable e) { Log.getLog().warn(line, e); } public static void severe(Throwable e) { Log.getLog().warn(e); } }
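To round off the tutorial, here are a few usage sketches built only from the methods shown above. They assume DAO.init(...) has already run (see the bootstrap sketch at the top of this page); the "hello.world" config key, the query text and the class name are invented for illustration.

    import org.loklak.data.DAO;
    import org.loklak.objects.MessageEntry;
    import org.loklak.objects.Timeline;

    public class DaoUsageSketch {

        // Assumes DAO.init(...) has completed successfully.
        public static void demo() {
            // Config lookups always carry a caller-supplied default.
            String[] backends = DAO.getConfig("backend", new String[0], ",");
            boolean flag = DAO.getConfig("hello.world", false);       // hypothetical key, falls back to false

            // countLocalMessages(millis) picks the smallest index that can answer the question.
            long lastHour = DAO.countLocalMessages(3600000L);         // served from messages_hour
            long lastSixHours = DAO.countLocalMessages(6 * 3600000L); // > 1h, so served from messages_day
            long total = DAO.countLocalMessages();                    // the full messages index

            DAO.log("backends configured: " + backends.length + ", debug flag: " + flag);
            DAO.log("messages last hour: " + lastHour + ", last six hours: " + lastSixHours + ", total: " + total);

            // A local search; "since:hour", "since:day" or "since:week" in the query string also
            // steers which of the tiered message indices is used.
            DAO.SearchLocalMessages result = new DAO.SearchLocalMessages(
                    "spacex", Timeline.Order.CREATED_AT, 0 /* timezoneOffset */, 20 /* resultCount */, 0 /* no aggregation */);
            for (MessageEntry m : result.timeline) {
                DAO.log(m.getText(Integer.MAX_VALUE, ""));
            }
        }
    }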