Java tutorial
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.hops.examples.spark.kafka;

import io.hops.util.Hops;
import io.hops.util.exceptions.SchemaNotFoundException;
import io.hops.util.spark.SparkConsumer;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.lang.StringUtils;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.json.JSONObject;
/**
 * Consumes log messages from one or more topics in Kafka, converts each record into a JSON document
 * for an Elasticsearch index, and archives the data in Parquet format in HDFS.
 * <p>
 * Usage: StreamingLogs <sink>
 * <sink> location in HDFS to which the streaming output is appended
 * <p>
 * Example: /Projects/MyProject/Sink/Data
 */
public final class StreamingLogs {

  private static final Logger LOG = Logger.getLogger(StreamingLogs.class.getName());

  public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    // Use the applicationId for the sink folder
    final String appId = jssc.sparkContext().getConf().getAppId();
    SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();

    // Configure the Kafka consumer
    Properties props = new Properties();
    props.put("value.deserializer", StringDeserializer.class.getName());
    props.put("client.id", Hops.getJobName());
    SparkConsumer consumer = Hops.getSparkConsumer(jssc, props);

    // Create a direct Kafka stream for the project's topics
    JavaInputDStream<ConsumerRecord<String, String>> messages = consumer.createDirectStream();

    // Parse each record into a JSON index document, then into a NamenodeLogEntry bean
    JavaDStream<NamenodeLogEntry> logEntries = messages
        .map(new Function<ConsumerRecord<String, String>, JSONObject>() {
          @Override
          public JSONObject call(ConsumerRecord<String, String> record)
              throws SchemaNotFoundException, MalformedURLException, ProtocolException {
            LOG.log(Level.INFO, "record:{0}", record);
            return parser(record.value(), appId);
          }
        })
        .map(new Function<JSONObject, NamenodeLogEntry>() {
          @Override
          public NamenodeLogEntry call(JSONObject json)
              throws SchemaNotFoundException, MalformedURLException, ProtocolException, IOException {
            NamenodeLogEntry logEntry = new NamenodeLogEntry(
                json.getString("message").replace("\n\t", "\n").replace("\n", "---"),
                json.getString("priority"),
                json.getString("logger_name"),
                json.getString("timestamp"),
                json.getString("file"));
            LOG.log(Level.INFO, "NamenodeLogEntry:{0}", logEntry);
            return logEntry;
          }
        });

    // logEntries.print(); // uncomment to print each microbatch for debugging

    // Append every non-empty microbatch to a Parquet dataset in the project's Resources
    logEntries.foreachRDD(new VoidFunction2<JavaRDD<NamenodeLogEntry>, Time>() {
      @Override
      public void call(JavaRDD<NamenodeLogEntry> rdd, Time time) throws Exception {
        if (!rdd.isEmpty()) {
          Dataset<Row> row = sparkSession.createDataFrame(rdd, NamenodeLogEntry.class);
          row.write().mode(SaveMode.Append)
              .parquet("/Projects/" + Hops.getProjectName() + "/Resources/LogAnalysis");
        }
      }
    });

    /*
     * Enable this to save all the streaming output as text. It creates a folder for
     * every microbatch slot:
     *
     * wordCounts.saveAsHadoopFiles(args[1], "txt", String.class,
     *     String.class, (Class) TextOutputFormat.class);
     */

    // Start the computation
    jssc.start();
    Hops.shutdownGracefully(jssc);
  }
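  /**
   * Parses a Filebeat-style JSON record into the document that is indexed in Elasticsearch.
   * The first four space-separated tokens of the "message" field are expected to be the date,
   * time, priority and logger name; the remainder is the message body. If the head cannot be
   * parsed, the index fields are set to "parse error" defaults and the raw message is kept.
   */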
  private static JSONObject parser(String line, String appId) {
    JSONObject jsonLog = new JSONObject(line);
    JSONObject index = new JSONObject();
    String priority, logger, timestamp;
    priority = logger = timestamp = null;

    // Split the head of the message (date, time, priority, logger) from its body
    String[] attrs = jsonLog.getString("message")
        .substring(0, StringUtils.ordinalIndexOf(jsonLog.getString("message"), " ", 4)).split(" ");
    String message = jsonLog.getString("message")
        .substring(StringUtils.ordinalIndexOf(jsonLog.getString("message"), " ", 4) + 1);
    try {
      priority = attrs[2];
      logger = attrs[3];
      timestamp = attrs[0] + " " + attrs[1];

      // Convert the log4j-style timestamp into the ISO-8601 format expected by Elasticsearch.
      // Note HH (24-hour clock) rather than hh, which would misread afternoon hours.
      DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
      Date result = df.parse(timestamp);
      Locale currentLocale = Locale.getDefault();
      SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", currentLocale);
      timestamp = format.format(result);
    } catch (Exception ex) {
      LOG.log(Level.WARNING, "Error while parsing log, setting default index parameters:{0}",
          ex.getMessage());
      message = jsonLog.getString("message");
      priority = "parse error";
      logger = "parse error";
      timestamp = "parse error";
    }

    index.put("message", message);
    index.put("priority", priority);
    index.put("logger_name", logger);
    index.put("timestamp", timestamp);
    index.put("application", appId);
    index.put("host", jsonLog.getJSONObject("beat").getString("hostname"));
    index.put("project", Hops.getProjectName());
    index.put("jobname", Hops.getJobName());

    // Keep only the file name from the full source path
    if (jsonLog.getString("source").contains("/")) {
      index.put("file", jsonLog.getString("source").substring(jsonLog.getString("source").lastIndexOf("/") + 1));
    } else {
      index.put("file", jsonLog.getString("source"));
    }
    return index;
  }
}
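The timestamp handling in parser() is the easiest part to get wrong, since the log4j-style input
("yyyy-MM-dd HH:mm:ss,SSS") and the ISO-8601 output differ in both separators and pattern letters.
Below is a minimal, self-contained sketch of just that conversion; the class name and the sample
timestamp value are illustrative only, not part of the job above.

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class TimestampFormatExample {

  public static void main(String[] args) throws Exception {
    // A made-up log4j-style timestamp, as it would appear at the head of a log message.
    String raw = "2018-03-14 14:26:53,123";

    // HH = 24-hour clock; the comma separates the milliseconds in log4j output.
    SimpleDateFormat in = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
    Date parsed = in.parse(raw);

    // ISO-8601 form with literal 'T' and 'Z', matching the pattern used in parser().
    SimpleDateFormat out = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", Locale.getDefault());
    System.out.println(out.format(parsed)); // prints 2018-03-14T14:26:53.123Z
  }
}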