Java tutorial
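The listing below is the complete LogstashUtils helper class (package com.ikanow.aleph2.harvest.logstash.utils) from the Aleph2 logstash harvester. It builds the ProcessBuilder used to run logstash test configurations, fills in the HDFS and Elasticsearch output templates, loads the Hadoop Configuration (with a retry loop that works around a suspected concurrency bug in Configuration), and converts logstash's own log output into BasicMessageBean entries for the bucket logger. A short usage sketch follows the listing.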
/*******************************************************************************
 * Copyright 2015, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ikanow.aleph2.harvest.logstash.utils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.base.Charsets;
import com.ikanow.aleph2.data_model.interfaces.data_import.IHarvestContext;
import com.ikanow.aleph2.data_model.interfaces.data_services.IStorageService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IBucketLogger;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.shared.BasicMessageBean;
import com.ikanow.aleph2.data_model.objects.shared.GlobalPropertiesBean;
import com.ikanow.aleph2.data_model.utils.BucketUtils;
import com.ikanow.aleph2.data_model.utils.ErrorUtils;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Patterns;
import com.ikanow.aleph2.data_model.utils.TimeUtils;
import com.ikanow.aleph2.harvest.logstash.data_model.LogstashBucketConfigBean;
import com.ikanow.aleph2.harvest.logstash.data_model.LogstashHarvesterConfigBean;
import com.ikanow.aleph2.harvest.logstash.services.LogstashHarvestService;

/** Utilities for manipulating logstash assets (config file code is separate)
 *  Mostly copied from V1
 * @author Alex
 */
public class LogstashUtils {

    private static final Logger _logger = LogManager.getLogger();

    private static final String OUTPUT_FILE_SYNTAX = "ls_input_%{+yyyy.MM.dd.hh}_%{[@metadata][thread_id]}.json"; // (new file every minute unless flushed first)
    private static final String TEST_SEGMENT_PERIOD_OVERRIDE = "10";
    private static final Integer DEFAULT_FLUSH_INTERVAL = 300;
    private static final String HDFS_NAMENODE_HTTP_ADDRESS = "dfs.namenode.http-address."; // dev.nn1 dev.nn2 etc
    private static final String HDFS_NAMESERVICES = "dfs.nameservices";

    /** Builds a process to execute
     * @param global
     * @param bucket_config
     * @param logstash_config
     * @param requested_docs
     * @param bucket_path if this is present, will log output to /tmp/unique_sig
     * @return
     */
    public static ProcessBuilder buildLogstashTest(final LogstashHarvesterConfigBean global,
            final LogstashBucketConfigBean bucket_config, final String logstash_config,
            final long requested_docs, final Optional<String> bucket_path) {

        final String log_file = System.getProperty("java.io.tmpdir") + File.separator
                + BucketUtils.getUniqueSignature(bucket_path.orElse("DNE"), Optional.empty());
        try { // (delete log file if it exists)
            new File(log_file).delete();
        }
        catch (Exception e) {}

        ArrayList<String> args = new ArrayList<String>();
        args.addAll(Arrays.asList(global.binary_path(), "-e", logstash_config));
        if (bucket_path.isPresent()) {
            args.addAll(Arrays.asList("-l", log_file));
        }
        if (0L == requested_docs) {
            args.add("-t"); // test mode, much faster
        } //TESTED
        if (bucket_config.debug_verbosity()) {
            args.add("--debug");
        }
        else {
            args.add("--verbose");
        }
        ProcessBuilder logstashProcessBuilder = new ProcessBuilder(args);
        logstashProcessBuilder = logstashProcessBuilder.directory(new File(global.working_dir()))
                .redirectErrorStream(true);
        logstashProcessBuilder.environment().put("JAVA_OPTS", "");

        return logstashProcessBuilder;
    }

    /**
     * @param type
     * @param bucket
     * @return
     * @throws IOException
     */
    public static String getOutputTemplate(final String type, final DataBucketBean bucket,
            final IStorageService storage_service, final String hadoop_root_path, final IHarvestContext context,
            final LogstashBucketConfigBean config, final GlobalPropertiesBean globals) throws IOException {

        if (type.equals("hdfs")) {
            // if test bucket, override segment_time to be 10s instead of 60s (or allow user to spec in config block)
//          final String import_dir = hadoop_root_path + storage_service.getBucketRootPath() + bucket.full_name() + IStorageService.TO_IMPORT_DATA_SUFFIX + OUTPUT_FILE_SYNTAX;
//          final String temp_dir = hadoop_root_path + storage_service.getBucketRootPath() + bucket.full_name() + IStorageService.TEMP_DATA_SUFFIX + OUTPUT_FILE_SYNTAX;
            final String import_dir = (storage_service.getBucketRootPath() + bucket.full_name()
                    + IStorageService.TO_IMPORT_DATA_SUFFIX + OUTPUT_FILE_SYNTAX).replaceAll("//", "/");
//          final String temp_dir = storage_service.getBucketRootPath() + bucket.full_name() + IStorageService.TEMP_DATA_SUFFIX + OUTPUT_FILE_SYNTAX;
            final List<String> hdfs_server_url = getHDFSServerURL(globals);
            final String output = IOUtils
                    .toString(LogstashHarvestService.class.getClassLoader().getResourceAsStream("output_hdfs.ls"),
                            Charsets.UTF_8)
//                  .replace("_XXX_TEMPORARY_PATH_XXX_", temp_dir)
                    .replace("_XXX_PATH_XXX_", import_dir)
                    .replace("_XXX_HOST1_XXX_", hdfs_server_url.get(0).substring(0, hdfs_server_url.get(0).indexOf(":")))
                    .replace("_XXX_PORT1_XXX_", hdfs_server_url.get(0).substring(hdfs_server_url.get(0).indexOf(":") + 1))
                    .replace("_XXX_HOST2_XXX_", hdfs_server_url.get(1).substring(0, hdfs_server_url.get(1).indexOf(":")))
                    .replace("_XXX_PORT2_XXX_", hdfs_server_url.get(1).substring(hdfs_server_url.get(1).indexOf(":") + 1))
                    .replace("_XXX_USER_XXX_", "tomcat") //TODO this should be a field in the HDFS config (see xxx_server_xxx)
                    .replace("_XXX_IDLE_FLUSH_TIME_XXX_",
                            BucketUtils.isTestBucket(bucket)
                                    ? TEST_SEGMENT_PERIOD_OVERRIDE
                                    : Optional.ofNullable(config.write_settings_override().batch_flush_interval())
                                            .orElse(DEFAULT_FLUSH_INTERVAL).toString())
                    .replace("_XXX_FLUSH_SIZE_XXX_",
                            Optional.ofNullable(config.write_settings_override().batch_max_objects())
                                    .orElse(LogstashBucketConfigBean.DEFAULT_MAX_OBJECTS).toString());

            return output;
        }
        else if (type.equals("elasticsearch")) {
            // Work out what the index naming is:

            // create the template
            context.getServiceContext().getSearchIndexService().get().getDataService().get()
                    .getWritableDataService(JsonNode.class, bucket, Optional.empty(), Optional.empty()).get();

            // replace out the elasticsearch-specific sub variables
            final Optional<String> grouping = Optionals
                    .of(() -> bucket.data_schema().temporal_schema().grouping_time_period());
            final String time_suffix = grouping
                    .<ChronoUnit>flatMap(g -> TimeUtils.getTimePeriod(g)
                            .<Optional<ChronoUnit>>validation(f -> Optional.empty(), s -> Optional.of(s)))
                    .map(p -> TimeUtils.getTimeBasedSuffix(p, Optional.empty()))
                    .map(s -> "_%{+" + s + "}")
                    //.map(s -> "_%{+" + s.replaceAll("y", "Y") + "}")
                    .orElse("");

            final String output = IOUtils
                    .toString(LogstashHarvestService.class.getClassLoader()
                            .getResourceAsStream("output_elasticsearch.ls"), Charsets.UTF_8)
                    .replace("_XXX_INDEX_XXX_",
                            BucketUtils.getUniqueSignature(bucket.full_name(), Optional.empty()) + time_suffix);

            return output;
        }
        else return "";
    }

    private static List<String> getHDFSServerURL(final GlobalPropertiesBean globals) {
        final Configuration config = getConfiguration(globals);

        // first get the dfs.nameservices
        final String dfs_name = config.get(HDFS_NAMESERVICES);

        return Arrays.asList(config.get(HDFS_NAMENODE_HTTP_ADDRESS + dfs_name + ".nn1"),
                config.get(HDFS_NAMENODE_HTTP_ADDRESS + dfs_name + ".nn2"));
    }

    /**
     * Retrieves the system configuration
     * (with code to handle possible internal concurrency bug in Configuration)
     * (tried putting a static synchronization around Configuration as an alternative)
     * @return
     */
    protected static Configuration getConfiguration(final GlobalPropertiesBean globals) {
        for (int i = 0; i < 60; ++i) {
            try {
                return getConfiguration(globals, i);
            }
            catch (java.util.ConcurrentModificationException e) {
                final long to_sleep = Patterns.match(i).<Long>andReturn()
                        .when(ii -> ii < 15, __ -> 100L)
                        .when(ii -> ii < 30, __ -> 250L)
                        .when(ii -> ii < 45, __ -> 500L)
                        .otherwise(__ -> 1000L)
                        + (new Date().getTime() % 100L); // (add random component)
                try { Thread.sleep(to_sleep); } catch (Exception ee) {}
                if (59 == i) throw e;
            }
        }
        return null;
    }

    protected static Configuration getConfiguration(final GlobalPropertiesBean globals, final int attempt) {
        synchronized (Configuration.class) {
            Configuration config = new Configuration(false);

            if (new File(globals.local_yarn_config_dir()).exists()) {
                config.addResource(new Path(globals.local_yarn_config_dir() + "/yarn-site.xml"));
                config.addResource(new Path(globals.local_yarn_config_dir() + "/core-site.xml"));
                config.addResource(new Path(globals.local_yarn_config_dir() + "/hdfs-site.xml"));
            }
            else {
                final String alternative = System.getenv("HADOOP_CONF_DIR");
                _logger.warn("Aleph2 yarn-config dir not found, try alternative: " + alternative);
                // (another alternative would be HADOOP_HOME + "/conf")
                if ((null != alternative) && new File(alternative).exists()) {
                    config.addResource(new Path(alternative + "/yarn-site.xml"));
                    config.addResource(new Path(alternative + "/core-site.xml"));
                    config.addResource(new Path(alternative + "/hdfs-site.xml"));
                }
                else // last ditch - will work for local testing but never from anything remote
                    config.addResource("default_fs.xml");
            }
            if (attempt > 10) { // (try sleeping here)
                final long to_sleep = 500L + (new Date().getTime() % 100L); // (add random component)
                try { Thread.sleep(to_sleep); } catch (Exception e) {}
            }

            // These are not added by Hortonworks, so add them manually
            config.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
            config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
            config.set("fs.AbstractFileSystem.hdfs.impl", "org.apache.hadoop.fs.Hdfs");
            config.set("fs.AbstractFileSystem.file.impl", "org.apache.hadoop.fs.local.LocalFs");

            return config;
        }
    }

    private static final String logstash_colon_search = "=>:(\\w+)";
    private static final String logstash_colon_replace = "=>\"$1\"";
    private static final String logstash_arrow_search = ":(\\w+)=>";
    private static final String logstash_arrow_replace = "\"$1\":";
    private static final String logstash_newline_search = "\\\\n";
    private static final String logstash_newline_replace = " ";
    private static final String logstash_plugin_search = "plugin\":(.*)>,";
    private static final ObjectMapper _mapper = new ObjectMapper();
    final static Pattern logstash_plugin_pattern = Pattern.compile(logstash_plugin_search); // (a compiled Pattern is thread-safe; the Matcher created from it in fixPlugin is not, so a new Matcher is created per call)

    /**
     * Reads the given output file and outputs it to the logger with the specified log level.
     * @param logger
     * @param level
     * @param output_file
     * @throws IOException
     */
    public static void sendOutputToLogger(final IBucketLogger logger, final Level level, final File output_file,
            final Optional<Long> max_lines) throws IOException {
//      _logger.error("Reading output file: " + output_file + " to send to logger at level: " + level);
        Files.lines(output_file.toPath()).limit(max_lines.orElse(10000L)).forEach(line -> {
            try {
                // convert line to valid json, then parse json, build BMB object from it
                final String fixed_line = line.replaceAll(logstash_colon_search, logstash_colon_replace)
                        .replaceAll(logstash_arrow_search, logstash_arrow_replace)
                        .replaceAll(logstash_newline_search, logstash_newline_replace);
                final String plugin_fixed = fixPlugin(fixed_line);
                final ObjectNode line_object = (ObjectNode) _mapper.readTree(plugin_fixed);

                // move specific fields we want into BMB
                final Date date = parseLogstashDate(line_object.remove("timestamp").asText());
                final Level logstash_level = Level.valueOf(line_object.remove("level").asText());
                final String message = line_object.remove("message").asText();

                // move everything else into details map
                logger.inefficientLog(logstash_level,
                        new BasicMessageBean(date, true, LogstashHarvestService.class.getSimpleName(),
                                "test_output", null, message,
                                StreamSupport
                                        .stream(Spliterators.spliteratorUnknownSize(line_object.fields(),
                                                Spliterator.ORDERED), true)
                                        .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue().asText()))));
            }
            catch (Exception ex) {
                // fallback on conversion failure
                logger.inefficientLog(level, ErrorUtils
                        .buildSuccessMessage(LogstashHarvestService.class.getSimpleName(), "test_output", line));
            }
        });

        //TODO should we delete log file after we've read it?
    }

    /**
     * @param fixed_line
     * @return
     */
    private static String fixPlugin(final String fixed_line) {
        if (fixed_line.contains("\"plugin\":")) {
            final Matcher m = logstash_plugin_pattern.matcher(fixed_line);
            if (m.find()) {
                final String to_replace = "plugin\":\"" + m.group(1).replaceAll("\"", "\\\\\\\\\"") + ">\",";
                return fixed_line.replaceAll(logstash_plugin_search, to_replace);
            }
        }
        return fixed_line; // there was no plugin, just return the line
    }

    private static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSSSSZ"); // (note: SimpleDateFormat instances are not thread-safe)

    /**
     * Parses the logstash-specific format into a date object; logstash dates look like: 2016-03-24T09:38:03.770000-0400
     *
     * @param date_string
     * @return
     * @throws ParseException
     */
    public static Date parseLogstashDate(final String date_string) throws ParseException {
        return formatter.parse(date_string);
    }
}
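A minimal usage sketch follows. It is not part of the original class: the LogstashUtilsExample class and its main method are illustrative assumptions added here; only LogstashUtils.parseLogstashDate and the sample timestamp (the one quoted in its javadoc) come from the listing above.

package com.ikanow.aleph2.harvest.logstash.utils;

import java.text.ParseException;
import java.util.Date;

/** Hypothetical usage sketch, not part of the original class: parses the example
 *  logstash timestamp quoted in the parseLogstashDate javadoc and prints it. */
public class LogstashUtilsExample {
    public static void main(final String[] args) throws ParseException {
        // Sample timestamp taken from the parseLogstashDate javadoc above
        final Date parsed = LogstashUtils.parseLogstashDate("2016-03-24T09:38:03.770000-0400");
        System.out.println("Parsed logstash timestamp: " + parsed);
    }
}

One caveat worth knowing: SimpleDateFormat treats the fractional part as a plain millisecond count, so the six digits "770000" are parsed as 770,000 ms and the resulting Date lands roughly 12-13 minutes after the literal wall-clock time. If exact sub-second precision matters, java.time's DateTimeFormatter handles a microsecond fraction correctly.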