eagle.dataproc.impl.storm.hdfs.DataCollectionHDFSSpout.java Source code

Introduction

Here is the source code for eagle.dataproc.impl.storm.hdfs.DataCollectionHDFSSpout.java. This Storm spout copies files from a configured HDFS path to a local staging directory, then reads each .gz or .csv file line by line and emits every line as a single-field tuple named "line".

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eagle.dataproc.impl.storm.hdfs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.*;
import java.util.zip.GZIPInputStream;

import com.typesafe.config.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

import eagle.dataproc.impl.storm.hdfs.HDFSSourcedStormSpoutProvider.HDFSSpout;

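/**
 * Storm spout that copies files from HDFS into a local staging directory and
 * replays their contents line by line. Processed files are tracked in an
 * in-memory map, so each file is emitted at most once per worker lifetime.
 */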
public class DataCollectionHDFSSpout extends HDFSSpout {

    private static final long serialVersionUID = 8775646842131298552L;
    private Config configContext;
    private TopologyContext _context;
    private SpoutOutputCollector _collector;
    private Map<String, Boolean> processFileMap = null;
    private static final Logger LOG = LoggerFactory.getLogger(DataCollectionHDFSSpout.class);

    public DataCollectionHDFSSpout(Config configContext) {
        this.configContext = configContext;
        processFileMap = new HashMap<String, Boolean>();
        LOG.info("DataCollectionHDFSSpout called");

    }

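    /**
     * Copies the configured HDFS source path to the local copyToPath directory.
     * The connection string, source path, and destination are all read from the
     * typesafe Config supplied at construction time.
     */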
    public void copyFiles() {
        LOG.info("Inside listFiles()");
        Configuration conf = new Configuration();
        // _____________ TO TEST THAT CORRECT HADOOP JARs ARE INCLUDED __________________
        ClassLoader cl = ClassLoader.getSystemClassLoader();
        URL[] urls = ((URLClassLoader) cl).getURLs();
        if (LOG.isDebugEnabled()) {
            for (URL url : urls) {
                LOG.debug(url.getFile());
            }
        }
        // _________________________________________
        String hdfsConnectionStr = configContext.getString("dataSourceConfig.hdfsConnnection");
        LOG.info("HDFS connection string: " + hdfsConnectionStr);

        String hdfsPath = configContext.getString("dataSourceConfig.hdfsPath");
        LOG.info("HDFS path: " + hdfsPath);

        String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
        LOG.info("copyToPath: " + copyToPath);
        String srcPathStr = "hdfs://" + hdfsConnectionStr + hdfsPath;
        Path srcPath = new Path(srcPathStr);
        LOG.info("srcPath: " + srcPath);
        try {
            FileSystem fs = srcPath.getFileSystem(conf);

            Path destPath = new Path(copyToPath);
            LOG.info("Destination path: " + destPath);
            fs.copyToLocalFile(srcPath, destPath);
            LOG.info("Copy to local succeed");
            fs.close();

        } catch (IOException e) {
            LOG.error("Failed to copy " + srcPathStr + " to " + copyToPath, e);
        }

    }

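    /**
     * Recursively collects the absolute paths of all .gz and .csv files under
     * the given root directory; level tracks the recursion depth.
     */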
    private List<String> getAllFiles(String root, int level) {

        List<String> lists = new ArrayList<String>();
        File rootFile = new File(root);
        File[] tempList = rootFile.listFiles();
        if (tempList == null)
            return lists;

        for (File temp : tempList) {
            if (temp.isDirectory()) {
                // Pass level + 1 rather than ++level so that sibling directories
                // all see the same depth; ++level would mutate it across iterations.
                lists.addAll(getAllFiles(temp.getAbsolutePath(), level + 1));
            } else if (temp.getName().endsWith(".gz") || temp.getName().endsWith(".csv")) {
                lists.add(temp.getAbsolutePath());
            }
        }
        return lists;

    }

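    /** Returns all eligible files currently present under the local copyToPath. */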
    public List<String> listFiles() {

        String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
        LOG.info("Reading from: " + copyToPath);
        return getAllFiles(copyToPath, 0);
    }

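    /**
     * Reads every not-yet-processed file, decompressing the input when the
     * configured fileFormat is GZIP, and emits each line as one tuple.
     */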
    @Override
    public void nextTuple() {
        LOG.info("Releasing nextTuple");
        List<String> files = listFiles();
        LOG.info("Files returned: " + files.size());
        String typeOfFile = configContext.getString("dataSourceConfig.fileFormat");
        LOG.info("typeOfFile returned: " + typeOfFile);

        for (String fileName : files) {
            LOG.info("fileName: " + fileName);
            LOG.info("processFileMap.get(fileName): " + processFileMap.get(fileName));
            if (!Boolean.TRUE.equals(processFileMap.get(fileName))) {
                processFileMap.put(fileName, true);
                BufferedReader br = null;
                Reader decoder = null;
                GZIPInputStream in = null;
                InputStream inStream = null;

                try {
                    if (typeOfFile.equalsIgnoreCase("GZIP")) {
                        in = new GZIPInputStream(new FileInputStream(new File(fileName)));
                        decoder = new InputStreamReader(in);
                    } else if (typeOfFile.equalsIgnoreCase("CSV")) {
                        inStream = new FileInputStream(new File(fileName));
                        decoder = new InputStreamReader(inStream);
                    } else {
                        LOG.error("No known file type specified");
                        continue;
                    }

                    br = new BufferedReader(decoder);
                    int lineNo = 0;
                    String line = "";
                    while ((line = br.readLine()) != null) {
                        ++lineNo;
                        // Emit each line as an unanchored single-field tuple; no
                        // message id is attached, so ack/fail are never invoked.
                        _collector.emit(new Values(line));
                        LOG.info("Emitted line no: " + lineNo + " from file: " + fileName);
                        Utils.sleep(100);
                    }
                } catch (Exception e) {
                    LOG.error("Failed to read " + fileName, e);
                } finally {
                    try {
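                        // Closing br also closes the wrapped streams, but each is
                        // closed explicitly in case construction failed partway.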
                        if (br != null)
                            br.close();
                        if (decoder != null)
                            decoder.close();
                        if (in != null)
                            in.close();
                        if (inStream != null)
                            inStream.close();
                    } catch (IOException e) {
                        LOG.warn("Failed to close reader for " + fileName, e);
                    }
                }
            } else {
                LOG.info("Processed the files before, already done! ");
                //Utils.sleep(10000);
            }

        }

    }

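    // Tuples are emitted without message ids (see nextTuple), so Storm will not
    // call ack/fail with a transaction id in practice; these overrides remain
    // for completeness.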
    @Override
    public void fail(Object msgId) {
        int transactionId = (Integer) msgId;
        LOG.info(transactionId + " failed");
    }

    @Override
    public void ack(Object msgId) {
        int transactionId = (Integer) msgId;
        LOG.info(transactionId + " acknowledged");
    }

    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        _context = context;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Every tuple carries a single field named "line".
        declarer.declare(new Fields("line"));
    }
}
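
Example usage

The following is a minimal, hypothetical sketch of how this spout might be wired into a Storm topology, assuming HDFSSpout ultimately implements backtype.storm.topology.IRichSpout. The dataSourceConfig keys (including the hdfsConnnection spelling) are taken verbatim from the spout; the connection string, paths, topology name, and component id are illustrative assumptions, not part of the original source.

import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;
import com.typesafe.config.ConfigFactory;

public class DataCollectionTopologyExample {
    public static void main(String[] args) {
        // Illustrative configuration values; adjust to your environment.
        com.typesafe.config.Config config = ConfigFactory.parseString(
                "dataSourceConfig.hdfsConnnection = \"namenode-host:8020\"\n"
                + "dataSourceConfig.hdfsPath = \"/logs/audit\"\n"
                + "dataSourceConfig.copyToPath = \"/tmp/eagle-staging\"\n"
                + "dataSourceConfig.fileFormat = \"GZIP\"");

        DataCollectionHDFSSpout spout = new DataCollectionHDFSSpout(config);
        spout.copyFiles(); // stage the HDFS files locally before the topology starts

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("hdfsSpout", spout, 1);
        // Downstream bolts would subscribe to the single "line" field the spout declares.

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("hdfs-line-reader", new backtype.storm.Config(),
                builder.createTopology());
    }
}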