com.cloudera.flume.handlers.hdfs.CustomDfsSink.java Source code

Introduction

Here is the source code for com.cloudera.flume.handlers.hdfs.CustomDfsSink.java, a Flume event sink that writes events to a raw Hadoop DFS file using a configurable OutputFormat, with optional compression, Hive partition registration, and Elasticsearch indexing.

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hdfs;

import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.entity.StringEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.hive.MarkerStore;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;
import com.cloudera.flume.reporter.ReportEvent;
import com.google.common.base.Preconditions;

/**
 * This creates a raw hadoop dfs file that outputs data formatted by the
 * provided OutputFormat. It is assumed that the output is a file of some sort.
 */
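/*
 * A quick usage sketch (illustrative): in a Flume v0.9.x sink specification this
 * class is normally reached through the builder at the bottom of this file, e.g.
 *
 *   customdfs("hdfs://namenode:8020/flume/demo", "raw")
 *
 * The namenode address, the path, and the "raw" format name are assumptions for
 * illustration; the second argument can be any output format name known to
 * FormatFactory.
 */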
public class CustomDfsSink extends EventSink.Base {
    static final Logger LOG = LoggerFactory.getLogger(CustomDfsSink.class);

    private static final String A_OUTPUTFORMAT = "recordformat";
    private static final DateFormat dateFormatDay = new SimpleDateFormat("yyyy-MM-dd");
    private static final DateFormat dateFormatHourMinuteSecond = new SimpleDateFormat("HH-mm-ss");
    private static final DateFormat dateFormatHourMinute = new SimpleDateFormat("HH-mm");
    private static final DateFormat dateFormatHour = new SimpleDateFormat("HH");

    List<StringEntity> stringEntities = new ArrayList<StringEntity>();
    boolean compressOutput, hiveOutput = false;
    OutputFormat format;
    OutputStream writer;
    AtomicLong count = new AtomicLong();
    String path;
    Path dstPath;
    String hiveTableName;
    String machineHostName;
    Calendar cal;
    Event localEvent;
    FlumeConfiguration conf;
    MarkerStore hup;
    String hiveMarkerFolder, hiveMarkerPath;

    StringBuilder sb = new StringBuilder();
    String elasticIndex, elasticType, elasticSearchUrl;
    boolean runMarkerQueries = false;

    public CustomDfsSink(String path, OutputFormat format) {
        Preconditions.checkArgument(path != null);
        Preconditions.checkArgument(format != null);
        this.path = path;
        this.format = format;
        this.writer = null;
    }

    public CustomDfsSink(String path, OutputFormat format, Event event) {

        Preconditions.checkArgument(path != null);
        Preconditions.checkArgument(format != null);
        this.path = path;
        this.format = format;
        this.writer = null;
        this.localEvent = event;
        // The builder below passes a null event, so only derive the calendar when an event is present.
        if (localEvent != null) {
            cal = Calendar.getInstance();
            cal.setTimeInMillis(localEvent.getTimestamp());
        }
        this.conf = FlumeConfiguration.get();
        try {
            machineHostName = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            LOG.error("Error getting hostname for local machine: " + e.getMessage());
        }

    }

    public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName) {
        sb = new StringBuilder();

        Preconditions.checkArgument(path != null);
        Preconditions.checkArgument(format != null);
        this.path = path;
        this.localEvent = event;

        cal = Calendar.getInstance();
        cal.setTimeInMillis(localEvent.getTimestamp());
        this.format = format;
        this.writer = null;
        this.conf = FlumeConfiguration.get();
        this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

        if (StringUtils.isNotBlank(hiveTableName)) {
            this.hiveOutput = true;
            this.hiveTableName = hiveTableName;
            hup = new MarkerStore(hiveTableName, null, false);
        }

        try {
            machineHostName = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            LOG.error("Error getting hostname for local machine: " + e.getMessage());
        }
    }

    public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName,
            String elasticSearchUrl, String elasticIndex, String elasticType, boolean runMarkerQueries) {
        LOG.info("inside CUSTOMDFSSINK: URL: " + elasticSearchUrl + " Index: " + elasticIndex + " Type: "
                + elasticType, " RunMarkerQueries: " + runMarkerQueries);

        sb = new StringBuilder();
        this.elasticSearchUrl = elasticSearchUrl;
        //    if (StringUtils.indexOf(this.elasticSearchUrl, "/", this.elasticSearchUrl.length() - 1) > -1) {
        //      elasticSearchUrl = StringUtils.replaceOnce(StringUtils.reverse(elasticSearchUrl), "/", "");
        //    }
        this.elasticIndex = elasticIndex;
        this.elasticType = elasticType;
        this.runMarkerQueries = runMarkerQueries;

        Preconditions.checkArgument(path != null);
        Preconditions.checkArgument(format != null);
        this.path = path;
        this.localEvent = event;

        cal = Calendar.getInstance();
        cal.setTimeInMillis(localEvent.getTimestamp());
        this.format = format;
        this.writer = null;
        this.conf = FlumeConfiguration.get();
        this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

        if (StringUtils.isNotBlank(hiveTableName)) {
            this.hiveOutput = true;
            this.hiveTableName = hiveTableName;
            this.elasticSearchUrl = elasticSearchUrl;
            hup = new MarkerStore(hiveTableName, elasticSearchUrl, runMarkerQueries);
        }

        try {
            machineHostName = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            LOG.error("Error getting hostname for local machine: " + e.getMessage());
        }
    }

    @Override
    public void append(Event e) throws IOException {
        if (writer == null) {
            throw new IOException("Append failed, did you open the writer?");
        }
        sb.append("{ \"index\" : { \"_index\" : \"" + elasticIndex + "\", \"_type\" : \"" + elasticType
                + "\" } }\n{ \"" + elasticType + "\" : " + new String(e.getBody()) + " }\n");

        format.format(writer, e);
        count.getAndIncrement();
        super.append(e);

    }

    @Override
    public void close() {
        try {
            LOG.info("Closing HDFS file: " + dstPath);
            writer.flush();
            LOG.info("done writing raw file to hdfs");
            writer.close();

            if (StringUtils.isNotBlank(elasticSearchUrl) && StringUtils.isNotBlank(elasticIndex)
                    && StringUtils.isNotBlank(elasticType)) {
                hup.sendESQuery(elasticSearchUrl, sb.toString());
            }

            if (!deleteEmptyFile(dstPath)) {
                if (localEvent != null && hiveOutput) {
                    String dataFolder = StringUtils.substringBeforeLast(dstPath.toString(), "/");

                    //String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='" + dateFormatDay.format(cal.getTime()) + "', ts='" + dateFormatHour.format(cal.getTime()) + "') LOCATION '" + dataFolder + "'";
                    String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='"
                            + dateFormatDay.format(localEvent.getTimestamp()) + "', ts='"
                            + dateFormatHour.format(localEvent.getTimestamp()) + "') LOCATION '" + dataFolder + "'";

                    LOG.info("HQL Query: " + hqlQuery + "\n\n\n\n\n");

                    hiveMarkerPath = hiveMarkerFolder + "/" + machineHostName + "-" + localEvent.getTimestamp()
                            + ".marker";

                    if (!hup.runHiveQuery(hqlQuery)) {
                        writeHiveMarker(hqlQuery, dstPath.toString(), hiveMarkerFolder, hiveMarkerPath);
                    }
                    //      boolean hiveMarkerStatus = writeHiveMarker(hqlQuery, dstPath.toString(), hiveMarkerFolder, hiveMarkerPath);
                    //
                    //      if (hiveMarkerStatus) {
                    //        if (hup.runHiveQuery(hqlQuery)) {
                    //          hup.cleanHiveMarker(hiveMarkerPath);
                    //        }
                    //      }
                }
            } else {
                LOG.info("deleted empty file: " + dstPath);
            }
            localEvent = null;
            cal = null;

            writer = null;
        } catch (IOException e) {
            LOG.error("Error closing HDFS file: " + dstPath, e);
        }
    }

    private boolean deleteEmptyFile(Path dstPath) {
        try {
            FileSystem fs = dstPath.getFileSystem(conf);
            if (fs.getFileStatus(dstPath).getLen() == 0) { //empty file, needs to be deleted
                LOG.info("empty file: " + dstPath);
                return fs.delete(dstPath, false);
            }

        } catch (IOException e) {
            LOG.error("Error checking/deleting empty file: " + dstPath, e);
        }
        LOG.info("non-empty file: " + dstPath);
        return false;
    }

    private boolean writeHiveMarker(String hqlQuery, String filePath, String hiveMarkerFolder,
            String hiveMarkerPath) {
        LOG.info("writing to hiveMarker: " + hiveMarkerFolder);
        LOG.info("hiveMarkerPath: " + hiveMarkerPath);

        FileSystem hdfs;
        dstPath = new Path(hiveMarkerFolder);
        try {
            hdfs = dstPath.getFileSystem(conf);

            if (!hdfs.exists(dstPath)) {
                hdfs.mkdirs(dstPath);
            }
            dstPath = new Path(hiveMarkerPath);
            FSDataOutputStream writer_marker = hdfs.create(dstPath);
            writer_marker.writeBytes(filePath + "\t" + hqlQuery + "\n");
            writer_marker.close();
            dstPath = null;
            writer_marker = null;

        } catch (IOException e) {
            LOG.error("Error writing Hive marker file: " + hiveMarkerPath, e);
            return false;
        }

        return true;

    }

    @Override
    public void open() throws IOException {
        FlumeConfiguration conf = FlumeConfiguration.get();
        FileSystem hdfs;

        // use v0.9.1 compression settings
        if (conf.getCollectorDfsCompressGzipStatus()) {
            LOG.warn("Config property " + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_GZIP
                    + " is deprecated, please use " + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_CODEC
                    + " set to GzipCodec instead");
            CompressionCodec gzipC = new GzipCodec();
            Compressor gzCmp = gzipC.createCompressor();
            dstPath = new Path(path + gzipC.getDefaultExtension());
            hdfs = dstPath.getFileSystem(conf);
            writer = hdfs.create(dstPath);
            writer = gzipC.createOutputStream(writer, gzCmp);
            LOG.info("Creating HDFS gzip compressed file: " + dstPath.toString());
            return;
        }

        String codecName = conf.getCollectorDfsCompressCodec();
        List<Class<? extends CompressionCodec>> codecs = CompressionCodecFactory
                .getCodecClasses(FlumeConfiguration.get());
        CompressionCodec codec = null;
        ArrayList<String> codecStrs = new ArrayList<String>();
        codecStrs.add("None");
        for (Class<? extends CompressionCodec> cls : codecs) {
            codecStrs.add(cls.getSimpleName());

            if (cls.getSimpleName().equals(codecName)) {
                try {
                    codec = cls.newInstance();
                } catch (InstantiationException e) {
                    // codec is still null at this point, so log the class we tried to instantiate
                    LOG.error("Unable to instantiate " + cls.getSimpleName() + " class");
                } catch (IllegalAccessException e) {
                    LOG.error("Unable to access " + cls.getSimpleName() + " class");
                }
            }
        }

        if (codec == null) {
            if (!codecName.equals("None")) {
                LOG.warn("Unsupported compression codec " + codecName + ".  Please choose from: " + codecStrs);
            }
            dstPath = new Path(path);
            hdfs = dstPath.getFileSystem(conf);
            writer = hdfs.create(dstPath);
            LOG.info("Creating HDFS file: " + dstPath.toString());
            return;
        }

        Compressor cmp = codec.createCompressor();
        dstPath = new Path(path + codec.getDefaultExtension());
        hdfs = dstPath.getFileSystem(conf);
        writer = hdfs.create(dstPath);
        try {
            writer = codec.createOutputStream(writer, cmp);
        } catch (NullPointerException npe) {
            // tries to find "native" version of codec, if that fails, then tries to
            // find java version. If there is no java version, the createOutpuStream
            // exits via NPE. We capture this and convert it into a IOE with a more
            // useful error message.
            LOG.error("Unable to load compression codec " + codec);
            throw new IOException("Unable to load compression codec " + codec);
        }
        LOG.info("Creating " + codec + " compressed HDFS file: " + dstPath.toString());
    }

    public static SinkBuilder builder() {
        return new SinkBuilder() {
            @Override
            public EventSink build(Context context, String... args) {
                if (args.length != 2 && args.length != 1) {
                    // TODO (jon) make this message easier.
                    throw new IllegalArgumentException(
                            "usage: customdfs(\"[(hdfs|file|s3n|...)://namenode[:port]]/path\", \"format\")");
                }

                String format = (args.length == 1) ? null : args[1];
                OutputFormat fmt;
                try {
                    fmt = FormatFactory.get().getOutputFormat(format);
                } catch (FlumeSpecException e) {
                    LOG.error("failed to load format " + format, e);
                    throw new IllegalArgumentException("failed to load format " + format);
                }
                return new CustomDfsSink(args[0], fmt, null);
            }
        };
    }

    @Override
    public String getName() {
        return "CustomDfs";
    }

    @Override
    public ReportEvent getReport() {
        ReportEvent rpt = super.getReport();
        rpt.setStringMetric(A_OUTPUTFORMAT, format.getBuilder().getName());
        rpt.setLongMetric(ReportEvent.A_COUNT, count.get());
        return rpt;
    }
}
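
Example

The sketch below shows one way to drive this sink programmatically, assuming the Flume v0.9.x APIs used above. The no-argument Context constructor, the EventImpl class, the "raw" output-format name, and the HDFS path are assumptions for illustration rather than anything taken from the listing.

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.hdfs.CustomDfsSink;

public class CustomDfsSinkExample {
    public static void main(String[] args) throws Exception {
        // Build the sink through its SinkBuilder, just as the spec
        // customdfs("hdfs://namenode:8020/flume/demo", "raw") would.
        EventSink sink = CustomDfsSink.builder().build(new Context(),
                "hdfs://namenode:8020/flume/demo", "raw");

        sink.open();                                        // creates the (optionally compressed) HDFS file
        Event e = new EventImpl("hello world".getBytes());  // a single test event
        sink.append(e);                                     // formats the event and writes it out
        sink.close();                                       // flushes, closes, and deletes the file if empty
    }
}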