Java tutorial: CustomDfsSink, a Flume event sink that writes formatted events to HDFS
/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hdfs;

import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.entity.StringEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.hive.MarkerStore;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;
import com.cloudera.flume.reporter.ReportEvent;

import com.google.common.base.Preconditions;

/**
 * This creates a raw hadoop dfs file that outputs data formatted by the
 * provided OutputFormat. It is assumed that the output is a file of some sort.
 */
public class CustomDfsSink extends EventSink.Base {
  static final Logger LOG = LoggerFactory.getLogger(CustomDfsSink.class);

  private static final String A_OUTPUTFORMAT = "recordformat";
  private static final DateFormat dateFormatDay = new SimpleDateFormat("yyyy-MM-dd");
  private static final DateFormat dateFormatHourMinuteSecond = new SimpleDateFormat("HH-mm-ss");
  private static final DateFormat dateFormatHourMinute = new SimpleDateFormat("HH-mm");
  private static final DateFormat dateFormatHour = new SimpleDateFormat("HH");

  List<StringEntity> stringEntities = new ArrayList<StringEntity>();

  boolean compressOutput, hiveOutput = false;
  OutputFormat format;
  OutputStream writer;
  AtomicLong count = new AtomicLong();
  String path;
  Path dstPath;
  String hiveTableName;
  String machineHostName;
  Calendar cal;
  Event localEvent;
  FlumeConfiguration conf;
  MarkerStore hup;
  String hiveMarkerFolder, hiveMarkerPath;
  StringBuilder sb = new StringBuilder();
  String elasticIndex, elasticType, elasticSearchUrl;
  boolean runMarkerQueries = false;

  public CustomDfsSink(String path, OutputFormat format) {
    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.format = format;
    this.writer = null;
  }

  public CustomDfsSink(String path, OutputFormat format, Event event) {
    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.format = format;
    this.writer = null;
    this.localEvent = event;
    // Guard against a null event: builder() constructs the sink with a null
    // event, so dereferencing localEvent unconditionally would throw an NPE.
    if (localEvent != null) {
      cal = Calendar.getInstance();
      cal.setTimeInMillis(localEvent.getTimestamp());
    }
    this.conf = FlumeConfiguration.get();
    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }
  }

  public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName) {
    sb = new StringBuilder();
    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.localEvent = event;
    cal = Calendar.getInstance();
    cal.setTimeInMillis(localEvent.getTimestamp());
    this.format = format;
    this.writer = null;
    this.conf = FlumeConfiguration.get();
    this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();
    if (StringUtils.isNotBlank(hiveTableName)) {
      this.hiveOutput = true;
      this.hiveTableName = hiveTableName;
      hup = new MarkerStore(hiveTableName, null, false);
    }
    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }
  }

  public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName,
      String elasticSearchUrl, String elasticIndex, String elasticType, boolean runMarkerQueries) {
    LOG.info("inside CUSTOMDFSSINK: URL: " + elasticSearchUrl + " Index: " + elasticIndex
        + " Type: " + elasticType + " RunMarkerQueries: " + runMarkerQueries);
    sb = new StringBuilder();
    this.elasticSearchUrl = elasticSearchUrl;
    // if (StringUtils.indexOf(this.elasticSearchUrl, "/", this.elasticSearchUrl.length() - 1) > -1) {
    //   elasticSearchUrl = StringUtils.replaceOnce(StringUtils.reverse(elasticSearchUrl), "/", "");
    // }
    this.elasticIndex = elasticIndex;
    this.elasticType = elasticType;
    this.runMarkerQueries = runMarkerQueries;
    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.localEvent = event;
    cal = Calendar.getInstance();
    cal.setTimeInMillis(localEvent.getTimestamp());
    this.format = format;
    this.writer = null;
    this.conf = FlumeConfiguration.get();
    this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();
    if (StringUtils.isNotBlank(hiveTableName)) {
      this.hiveOutput = true;
      this.hiveTableName = hiveTableName;
      this.elasticSearchUrl = elasticSearchUrl;
      hup = new MarkerStore(hiveTableName, elasticSearchUrl, runMarkerQueries);
    }
    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }
  }

  @Override
  public void append(Event e) throws IOException {
    if (writer == null) {
      throw new IOException("Append failed, did you open the writer?");
    }
    sb.append("{ \"index\" : { \"_index\" : \"" + elasticIndex + "\", \"_type\" : \""
        + elasticType + "\" } }\n{ \"" + elasticType + "\" : " + new String(e.getBody()) + " }\n");
    format.format(writer, e);
    count.getAndIncrement();
    super.append(e);
  }

  @Override
  public void close() {
    try {
      LOG.info("Closing HDFS file: " + dstPath);
      writer.flush();
      LOG.info("done writing raw file to hdfs");
      writer.close();
      if (StringUtils.isNotBlank(elasticSearchUrl) && StringUtils.isNotBlank(elasticIndex)
          && StringUtils.isNotBlank(elasticType)) {
        hup.sendESQuery(elasticSearchUrl, sb.toString());
      }
      if (!deleteEmptyFile(dstPath)) {
        if (localEvent != null && hiveOutput) {
          String dataFolder = StringUtils.substringBeforeLast(dstPath.toString(), "/");
          // String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='"
          //     + dateFormatDay.format(cal.getTime()) + "', ts='" + dateFormatHour.format(cal.getTime())
          //     + "') LOCATION '" + dataFolder + "'";
          String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='"
              + dateFormatDay.format(localEvent.getTimestamp()) + "', ts='"
              + dateFormatHour.format(localEvent.getTimestamp()) + "') LOCATION '" + dataFolder + "'";
          LOG.info("HQL Query: " + hqlQuery + "\n\n\n\n\n");
          hiveMarkerPath = hiveMarkerFolder + "/" + machineHostName + "-"
              + localEvent.getTimestamp() + ".marker";
          if (!hup.runHiveQuery(hqlQuery)) {
            writeHiveMarker(hqlQuery, dstPath.toString(), hiveMarkerFolder, hiveMarkerPath);
          }
          // boolean hiveMarkerStatus = writeHiveMarker(hqlQuery, dstPath.toString(),
          //     hiveMarkerFolder, hiveMarkerPath);
          //
          // if (hiveMarkerStatus) {
          //   if (hup.runHiveQuery(hqlQuery)) {
          //     hup.cleanHiveMarker(hiveMarkerPath);
          //   }
          // }
        }
      } else {
        LOG.info("deleted empty file: " + dstPath);
      }
      localEvent = null;
      cal = null;
      writer = null;
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  private boolean deleteEmptyFile(Path dstPath) {
    try {
      FileSystem fs = dstPath.getFileSystem(conf);
      if (fs.getFileStatus(dstPath).getLen() == 0) {
        // empty file, needs to be deleted
        LOG.info("empty file: " + dstPath);
        return fs.delete(dstPath, false);
      }
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    LOG.info("non-empty file: " + dstPath);
    return false;
  }

  private boolean writeHiveMarker(String hqlQuery, String filePath, String hiveMarkerFolder,
      String hiveMarkerPath) {
    LOG.info("writing to hiveMarker: " + hiveMarkerFolder);
    LOG.info("hiveMarkerPath: " + hiveMarkerPath);
    FileSystem hdfs;
    dstPath = new Path(hiveMarkerFolder);
    try {
      hdfs = dstPath.getFileSystem(conf);
      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(hiveMarkerPath);
      FSDataOutputStream writer_marker = hdfs.create(dstPath);
      writer_marker.writeBytes(filePath + "\t" + hqlQuery + "\n");
      writer_marker.close();
      dstPath = null;
      writer_marker = null;
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return true;
  }

  @Override
  public void open() throws IOException {
    FlumeConfiguration conf = FlumeConfiguration.get();
    FileSystem hdfs;

    // use v0.9.1 compression settings
    if (conf.getCollectorDfsCompressGzipStatus()) {
      LOG.warn("Config property " + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_GZIP
          + " is deprecated, please use " + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_CODEC
          + " set to GzipCodec instead");
      CompressionCodec gzipC = new GzipCodec();
      Compressor gzCmp = gzipC.createCompressor();
      dstPath = new Path(path + gzipC.getDefaultExtension());
      hdfs = dstPath.getFileSystem(conf);
      writer = hdfs.create(dstPath);
      writer = gzipC.createOutputStream(writer, gzCmp);
      LOG.info("Creating HDFS gzip compressed file: " + dstPath.toString());
      return;
    }

    String codecName = conf.getCollectorDfsCompressCodec();
    List<Class<? extends CompressionCodec>> codecs = CompressionCodecFactory
        .getCodecClasses(FlumeConfiguration.get());
    CompressionCodec codec = null;
    ArrayList<String> codecStrs = new ArrayList<String>();
    codecStrs.add("None");
    for (Class<? extends CompressionCodec> cls : codecs) {
      codecStrs.add(cls.getSimpleName());
      if (cls.getSimpleName().equals(codecName)) {
        try {
          codec = cls.newInstance();
        } catch (InstantiationException e) {
          LOG.error("Unable to instantiate " + codec + " class");
        } catch (IllegalAccessException e) {
          LOG.error("Unable to access " + codec + " class");
        }
      }
    }

    if (codec == null) {
      if (!codecName.equals("None")) {
        LOG.warn("Unsupported compression codec " + codecName
            + ". Please choose from: " + codecStrs);
      }
      dstPath = new Path(path);
      hdfs = dstPath.getFileSystem(conf);
      writer = hdfs.create(dstPath);
      LOG.info("Creating HDFS file: " + dstPath.toString());
      return;
    }

    Compressor cmp = codec.createCompressor();
    dstPath = new Path(path + codec.getDefaultExtension());
    hdfs = dstPath.getFileSystem(conf);
    writer = hdfs.create(dstPath);
    try {
      writer = codec.createOutputStream(writer, cmp);
    } catch (NullPointerException npe) {
      // Hadoop tries to find the "native" version of the codec; if that fails,
      // it tries to find the java version. If there is no java version,
      // createOutputStream exits via NPE. We capture this and convert it into
      // an IOException with a more useful error message.
      LOG.error("Unable to load compression codec " + codec);
      throw new IOException("Unable to load compression codec " + codec);
    }
    LOG.info("Creating " + codec + " compressed HDFS file: " + dstPath.toString());
  }

  public static SinkBuilder builder() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... args) {
        if (args.length != 2 && args.length != 1) {
          // TODO (jon) make this message easier.
          throw new IllegalArgumentException(
              "usage: customdfs(\"[(hdfs|file|s3n|...)://namenode[:port]]/path\", \"format\")");
        }

        String format = (args.length == 1) ? null : args[1];
        OutputFormat fmt;
        try {
          fmt = FormatFactory.get().getOutputFormat(format);
        } catch (FlumeSpecException e) {
          LOG.error("failed to load format " + format, e);
          throw new IllegalArgumentException("failed to load format " + format);
        }
        return new CustomDfsSink(args[0], fmt, null);
      }
    };
  }

  @Override
  public String getName() {
    return "CustomDfs";
  }

  @Override
  public ReportEvent getReport() {
    ReportEvent rpt = super.getReport();
    rpt.setStringMetric(A_OUTPUTFORMAT, format.getBuilder().getName());
    rpt.setLongMetric(ReportEvent.A_COUNT, count.get());
    return rpt;
  }
}
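
For readers who want to see the sink's life cycle end to end, here is a minimal, hypothetical driver. It is a sketch under a few assumptions: the EventImpl class from com.cloudera.flume.core with a byte-array constructor is assumed to be available in this build, and the HDFS path and event body are placeholders. Passing null as the format name selects the factory's default output format, exactly as builder() does when given a single argument.

import java.io.IOException;

import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.handlers.hdfs.CustomDfsSink;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;

public class CustomDfsSinkDemo {
  public static void main(String[] args) throws IOException, FlumeSpecException {
    // null format name -> the FormatFactory default, as in builder() with one argument
    OutputFormat fmt = FormatFactory.get().getOutputFormat(null);

    // Third argument is the event used for Hive partition naming; it is null here
    // (as in builder()), so close() skips the Hive and Elasticsearch bookkeeping.
    CustomDfsSink sink = new CustomDfsSink("hdfs://namenode:8020/flume/demo-output", fmt, null);

    sink.open();                                     // creates the HDFS file (compressed if configured)
    sink.append(new EventImpl("hello".getBytes()));  // EventImpl(byte[]) is an assumption; the body is formatted by fmt
    sink.close();                                    // flushes and closes; deletes the file if it ended up empty
  }
}

In a real deployment the sink is normally instantiated through builder() from a Flume sink specification rather than constructed directly.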