com.cloudera.flume.handlers.hive.MarkerStore.java Source code

Introduction

Here is the source code for com.cloudera.flume.handlers.hive.MarkerStore.java

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hive;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.service.HiveClient;
import org.apache.hadoop.hive.service.HiveServerException;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;

import com.cloudera.flume.conf.FlumeConfiguration;
import com.google.common.io.CharStreams;

/**
 * Writes events to a file given a Hadoop URI path. If no URI is specified,
 * it defaults to the one configured by the fs.default.name configuration
 * variable. The user can specify an output format for the file; if none is
 * specified, the default set by flume.collector.outputformat in the Flume
 * configuration file is used.
 * 
 * TODO (jon) refactor this to be sane. Not happening now.
 */
public class MarkerStore {

    private static String hiveHost;
    private static int hivePort;
    private static TTransport transport;
    private static TProtocol protocol;
    HiveClient client;
    FlumeConfiguration conf;
    Path dstPath;
    String hiveTableName, hiveMarkerFolder, elasticsearchMarkerFolder, elasticsearchUrl;
    boolean runMarkerQueries;

    final static Logger LOG = Logger.getLogger(MarkerStore.class.getName());

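    /**
     * Builds a MarkerStore bound to the configured Hive Thrift server and,
     * optionally, replays any outstanding Elasticsearch and Hive markers.
     */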
    public MarkerStore(String hiveTableName, String elasticsearchUrl, boolean runMarkerQueries) {
        this.conf = FlumeConfiguration.get();

        hiveHost = conf.getHiveHost();
        hivePort = conf.getHivePort();

        transport = new TSocket(hiveHost, hivePort);
        protocol = new TBinaryProtocol(transport);
        client = new HiveClient(protocol);
        if (StringUtils.isNotEmpty(elasticsearchUrl)) {
            this.elasticsearchUrl = elasticsearchUrl;
            this.elasticsearchMarkerFolder = conf.getElasticSearchMarkerFolder();
            if (runMarkerQueries) {
                LOG.info("RUNNING ELASTICSEARCHMARKERQUERIES\n");
                runElasticSearchMarkerQueries();
            }
        }
        this.hiveTableName = hiveTableName;
        hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();
        try {
            if (!transport.isOpen()) {
                LOG.error("hive transport is closed, re-opening");
                transport = new TSocket(hiveHost, hivePort);
                protocol = new TBinaryProtocol(transport);
                client = new HiveClient(protocol);
                transport.open();
                if (runMarkerQueries) {
                    LOG.info("RUNNING HIVEMARKERQUERIES\n");
                    runHiveMarkerQueries();
                }

            }

        } catch (TTransportException e) {
            LOG.error("error opening transport layer to hive" + e.getMessage());
        }

    }

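    /**
     * Replays every marker file in the Elasticsearch marker folder: reads the
     * stored payload, deletes the marker, and re-submits the payload to
     * Elasticsearch (a failed replay writes a fresh marker back).
     */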
    private boolean runElasticSearchMarkerQueries() {
        boolean success = true;
        FileSystem hdfs;
        FSDataInputStream in;
        dstPath = new Path(elasticsearchMarkerFolder);
        LOG.info("DSTPATH: " + dstPath);
        try {
            hdfs = dstPath.getFileSystem(conf);
            if (hdfs.exists(dstPath)) {
                FileStatus[] fileListing = hdfs.listStatus(dstPath);
                for (FileStatus fs : fileListing) {
                    if (!fs.isDir()) {
                        LOG.info("File marker path: " + fs.getPath());
                        in = hdfs.open(fs.getPath());
                        byte[] fileData = new byte[(int) fs.getLen()];
                        in.readFully(fileData);
                        in.close();
                        LOG.info("cleaning markerfile @: " + fs.getPath().toString());
                        cleanMarkerFile(fs.getPath().toString());
                        sendESQuery(elasticsearchUrl, new String(fileData));

                    }
                }
            }
        } catch (Exception e) {
            LOG.error("Error replaying Elasticsearch markers: " + e.getMessage());
            success = false;
        }
        return success;
    }

    /**
     * Sends a batched bulk payload to Elasticsearch over HTTP POST. (Think of
     * each Elasticsearch index as a distributed database and each type as a
     * table within it.) On failure the payload is written back to the marker
     * folder for later replay.
     */
    public boolean sendESQuery(String elasticSearchUrl, String sb) {
        boolean success = true;
        LOG.info("sending batched stringentities");
        LOG.info("elasticSearchUrl: " + elasticSearchUrl);
        try {

            HttpClient httpClient = new DefaultHttpClient();
            HttpPost httpPost = new HttpPost(elasticSearchUrl);
            StringEntity se = new StringEntity(sb);

            httpPost.setEntity(se);
            HttpResponse hr = httpClient.execute(httpPost);

            LOG.info("HTTP Response: " + hr.getStatusLine());
            LOG.info("Closing httpConnection");
            httpClient.getConnectionManager().shutdown();
            LOG.info("booooooo: " + CharStreams.toString(new InputStreamReader(se.getContent())));
        } catch (IOException e) {
            e.printStackTrace();
            success = false;
        } finally {
            if (!success) {
                LOG.info("ESQuery wasn't successful, writing to markerfolder");
                writeElasticSearchToMarkerFolder(new StringBuilder(sb));
            }
        }
        LOG.info("ESQuery was successful, yay!");
        return success;

    }

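    /**
     * Persists a failed Elasticsearch payload as a timestamped .marker file
     * under the configured marker folder so it can be replayed later.
     */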
    private boolean writeElasticSearchToMarkerFolder(StringBuilder httpQuery) {
        FileSystem hdfs;
        try {
            String markerFolder = conf.getElasticSearchDefaultMarkerFolder();
            dstPath = new Path(markerFolder);
            hdfs = dstPath.getFileSystem(conf);
            if (!hdfs.exists(dstPath)) {
                hdfs.mkdirs(dstPath);
            }

            dstPath = new Path(markerFolder + "/es-" + System.currentTimeMillis() + ".marker");
            System.out.println("creating file at: " + dstPath.toString());
            FSDataOutputStream writer_marker = hdfs.create(dstPath);
            writer_marker.writeBytes(httpQuery + "\n");
            writer_marker.close();
            dstPath = null;
            writer_marker = null;
        } catch (IOException e) {
            LOG.error("Error writing Elasticsearch marker: " + e.getMessage());
            return false;
        }
        return true;
    }

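    /**
     * Replays every Hive marker file. Each marker holds a tab-separated pair
     * of (data file path, HQL query); if the data file still exists, the query
     * is re-run and the marker deleted on success, while markers pointing at
     * missing files are simply removed.
     */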
    private boolean runHiveMarkerQueries() {
        boolean queryStatus = true;
        FileSystem hdfs;
        FSDataInputStream in;
        dstPath = new Path(hiveMarkerFolder);
        LOG.info("DSTPATH: " + dstPath);
        try {
            hdfs = dstPath.getFileSystem(conf);
            if (hdfs.exists(dstPath)) {
                FileStatus[] fileListing = hdfs.listStatus(dstPath);
                for (FileStatus fs : fileListing) {
                    if (!fs.isDir()) {
                        LOG.info("File marker path: " + fs.getPath());
                        in = hdfs.open(fs.getPath());
                        byte[] fileData = new byte[(int) fs.getLen()];
                        in.readFully(fileData);
                        // Close the stream once the marker contents are in
                        // memory so no branch below can leak it.
                        in.close();
                        String[] splitTab = new String(fileData).split("\t");
                        if (splitTab.length == 2) {
                            dstPath = new Path(splitTab[0]);
                            FileSystem hiveFile = dstPath.getFileSystem(conf);
                            if (hiveFile.exists(dstPath)) {
                                LOG.info("marker file data: " + splitTab[1]);
                                if (runHiveQuery(splitTab[1])) {
                                    LOG.info("Marker query is successful");
                                    cleanMarkerFile(fs.getPath().toString());
                                } else {
                                    LOG.info("Error running marker query, marker point not deleted");
                                    queryStatus = false;
                                }

                            } else {
                                LOG.info("marker points to invalid hive file location, deleting the marker");
                                cleanMarkerFile(fs.getPath().toString());
                            }
                        }
                    }
                }
            }
            hdfs.close();
        } catch (IOException e) {
            LOG.error("Error running runHiveMarkerQueries: " + e.getMessage());
            queryStatus = false;
        }

        return queryStatus;
    }

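    /**
     * Deletes a single marker file and returns true on success.
     */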
    public boolean cleanMarkerFile(String hiveMarkerPath) {
        LOG.debug("cleaning up hiveMarker: " + hiveMarkerPath);
        FileSystem localHdfs;
        Path deletePath = new Path(hiveMarkerPath);
        try {
            localHdfs = deletePath.getFileSystem(conf);
            if (localHdfs.delete(deletePath, false)) {
                LOG.debug("hiveMarker deleted successfully: " + hiveMarkerPath);
                return true;
            } else {
                LOG.error("error deleting hive marker: " + hiveMarkerPath);
            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            LOG.error("Error deleting hiveMarker: " + e.getMessage());
        }
        return false;
    }

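    /**
     * Executes an HQL statement through the Hive Thrift client, re-opening
     * the transport first if it has been closed.
     */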
    public boolean runHiveQuery(String query) {

        try {
            if (!transport.isOpen()) {
                LOG.error("hive transport is closed, re-opening");
                transport = new TSocket(hiveHost, hivePort);
                protocol = new TBinaryProtocol(transport);
                client = new HiveClient(protocol);
                transport.open();

            }
            client.execute(query);
            transport.close();
            return true;
        } catch (TTransportException e) {
            LOG.error("Error setting up transport with hive: " + e.getMessage());
        } catch (HiveServerException e) {
            LOG.error("HiveServerException: " + e.getMessage());
        } catch (TException e) {
            LOG.error("TException: " + e.getMessage());
        }

        return false;
    }

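    /**
     * Records a pending Hive load by writing the data file path and its HQL
     * query, tab-separated, to a marker file so the query can be replayed.
     */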
    public boolean writeHiveMarker(String hqlQuery, String filePath, String hiveMarkerFolder,
            String hiveMarkerPath) {
        LOG.debug("writing to hiveMarker: " + hiveMarkerFolder);

        FileSystem hdfs;
        dstPath = new Path(hiveMarkerFolder);
        try {
            hdfs = dstPath.getFileSystem(conf);

            if (!hdfs.exists(dstPath)) {
                hdfs.mkdirs(dstPath);
            }
            dstPath = new Path(hiveMarkerPath);
            FSDataOutputStream writer = hdfs.create(dstPath);
            writer.writeBytes(filePath + "\t" + hqlQuery + "\n");
            writer.close();
            dstPath = null;
            writer = null;

        } catch (IOException e) {
            LOG.error("Error writing hive marker: " + e.getMessage());
            return false;
        }

        return true;

    }

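    /**
     * Concatenates the given file and every file in the folder into
     * hiveOutputLocation, then deletes the merged source files.
     */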
    public boolean mergeFiles(String folder, Path file, String hiveOutputLocation) {
        FileSystem hdfs;
        FSDataInputStream in;
        FSDataOutputStream out;
        List<Path> fileCollection = new ArrayList<Path>();
        dstPath = new Path(folder);
        LOG.info("mergeFiles DSTPATH: " + dstPath);
        try {
            hdfs = dstPath.getFileSystem(conf);

            if (hdfs.exists(dstPath)) {
                FileStatus[] fileListing = hdfs.listStatus(dstPath);
                LOG.error("Creating file @: " + hiveOutputLocation);
                out = hdfs.create(new Path(hiveOutputLocation));

                // Write the primary file first, then append the folder's files.
                in = hdfs.open(file);
                byte[] fileData = new byte[(int) hdfs.getFileStatus(file).getLen()];
                in.readFully(fileData);
                in.close();
                out.write(fileData);

                for (FileStatus fs : fileListing) {
                    if (!fs.isDir()) {
                        LOG.info("mergeFiles File marker path: " + fs.getPath());
                        fileCollection.add(fs.getPath());
                        in = hdfs.open(fs.getPath());
                        fileData = new byte[(int) fs.getLen()];
                        in.readFully(fileData);
                        in.close();
                        out.write(fileData);
                    }
                }
                out.close();
            }

            hdfs.close();
            LOG.error("Written file: " + hiveOutputLocation);

            // Start the purge: delete all source files except the merged output.
            hdfs = dstPath.getFileSystem(conf);
            for (Path p : fileCollection) {
                if (hdfs.delete(p, false)) {
                    LOG.info("Successfully deleted: " + p);
                } else {
                    LOG.error("Error deleting file: " + p);
                }
            }

        } catch (IOException e) {
            LOG.error("Error running mergeFiles: " + e.getMessage());
            return false;
        }
        LOG.info("mergeFiles: done merging files");
        return true;
    }

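    /**
     * Returns true if the given path exists in HDFS; used to test whether a
     * Hive partition's backing directory is present.
     */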
    public boolean checkIfPartitionExists(String filePath) {
        dstPath = new Path(filePath);
        FileSystem hdfs;
        try {
            hdfs = dstPath.getFileSystem(conf);
            return hdfs.exists(dstPath);
        } catch (IOException e) {
            LOG.error("Error checking path: " + e.getMessage());
        }

        return false;
    }

}
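
Usage

A minimal usage sketch follows, assuming a Hive table named weblogs, an Elasticsearch bulk endpoint at http://localhost:9200/_bulk, and illustrative HDFS paths; none of these values come from the class itself. MarkerStore reads its Hive host, port, and marker folders from FlumeConfiguration, so a configured Flume site file is required for this to actually run.

package com.cloudera.flume.handlers.hive;

// Hypothetical driver class, placed in the same package as MarkerStore.
public class MarkerStoreExample {
    public static void main(String[] args) {
        // Illustrative constructor arguments; real deployments take these
        // from their Flume configuration. Passing false skips marker replay
        // at construction time.
        MarkerStore store = new MarkerStore("weblogs",
                "http://localhost:9200/_bulk", false);

        // Hypothetical data file and partition-registration query.
        String filePath = "/flume/weblogs/2012-01-01/part-0000";
        String hql = "ALTER TABLE weblogs ADD PARTITION (dt='2012-01-01') "
                + "LOCATION '/flume/weblogs/2012-01-01'";
        String markerPath = "/flume/markers/hive/weblogs-2012-01-01.marker";

        // Record the pending query first so it can be replayed if the
        // execution below fails.
        store.writeHiveMarker(hql, filePath, "/flume/markers/hive", markerPath);

        // Run the query; once it succeeds the marker is no longer needed.
        if (store.runHiveQuery(hql)) {
            store.cleanMarkerFile(markerPath);
        }
    }
}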