eagle.dataproc.impl.storm.hdfs.UserProfileGenerationHDFSSpout.java Source code


Introduction

Here is the source code for eagle.dataproc.impl.storm.hdfs.UserProfileGenerationHDFSSpout.java, a Storm spout that copies per-user CSV files from HDFS, aggregates their contents into UserProfileData objects, and emits one tuple per user.
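The spout reads all of its settings from a Typesafe Config object passed to its constructor. Below is a minimal, hypothetical sketch (names and values are placeholders, not taken from the project) of a configuration that satisfies the dataSourceConfig.* lookups made in the source:

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class UserProfileSpoutConfigExample {
    // Builds a Config containing the keys UserProfileGenerationHDFSSpout reads;
    // all values below are placeholders for illustration only.
    public static Config buildExampleConfig() {
        String hocon =
                "dataSourceConfig {\n"
                + "  hdfsConnection = \"namenode.example.com:8020\"\n"  // prepended after "hdfs://" by copyFiles()
                + "  hdfsPath = \"/user/profiles\"\n"
                + "  copyToPath = \"/tmp/userprofiles\"\n"
                + "  userList = \"/user-list.txt\"\n"                   // classpath resource, one user per line
                + "  fileFormat = \"csv\"\n"
                + "  containsFileHeader = true\n"
                + "}";
        return ConfigFactory.parseString(hocon);
    }
}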

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eagle.dataproc.impl.storm.hdfs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.typesafe.config.Config;
import eagle.dataproc.core.StreamingProcessConstants;
import eagle.dataproc.core.ValuesArray;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.utils.Utils;

import eagle.dataproc.impl.storm.hdfs.HDFSSourcedStormSpoutProvider.HDFSSpout;

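/**
 * Storm spout that builds per-user activity datasets from files previously
 * copied out of HDFS. For every user listed in the configured user-list
 * resource it reads the CSV files under dataSourceConfig.copyToPath/<user>,
 * collects the lines into a UserProfileData object, and emits one tuple per
 * user before closing itself.
 */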
public class UserProfileGenerationHDFSSpout extends HDFSSpout {

    private static final long serialVersionUID = 2274234104008894386L;
    private Config configContext;
    private TopologyContext _context;
    SpoutOutputCollector _collector;

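    /**
     * Serializable per-user container: the user name, the date/time tokens
     * derived from the input file names, the hour-in-day index recorded for
     * each line, and the raw CSV lines themselves.
     */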
    public class UserProfileData implements Serializable {
        private static final long serialVersionUID = -3315860110144736840L;
        private String user;
        private List<String> dateTime = new ArrayList<String>();
        private List<Integer> hrInDay = new ArrayList<Integer>();
        private List<String> line = new ArrayList<String>();

        public String getUser() {
            return user;
        }

        public void setUser(String user) {
            this.user = user;
        }

        public String getDateTime(int index) {
            return dateTime.get(index);
        }

        public List<String> getDateTimes() {
            return dateTime;
        }

        public void setDateTime(String dateTime) {
            this.dateTime.add(dateTime);
        }

        public int getHrInDay(int index) {
            return hrInDay.get(index);
        }

        public List<Integer> getHrsInDay() {
            return hrInDay;
        }

        public void setHrInDay(int hrInDay) {
            this.hrInDay.add(hrInDay);
        }

        public String getLine(int index) {
            return line.get(index);
        }

        public List<String> getLines() {
            return line;
        }

        public void setLine(String line) {
            this.line.add(line);
        }

    }

    private static final Logger LOG = LoggerFactory.getLogger(UserProfileGenerationHDFSSpout.class);

    public UserProfileGenerationHDFSSpout(Config configContext) {
        this.configContext = configContext;
        LOG.info("UserProfileGenerationHDFSSpout called");
    }

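    /**
     * Copies each user's input files from HDFS (hdfs://<hdfsConnection><hdfsPath>)
     * to the local directory configured as dataSourceConfig.copyToPath.
     */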
    public void copyFiles() {
        LOG.info("Inside copyFiles()");
        //Configuration conf = new Configuration();
        JobConf conf = new JobConf();
        // _____________ TO TEST THAT CORRECT HADOOP JARs ARE INCLUDED __________________
        ClassLoader cl = ClassLoader.getSystemClassLoader();
        URL[] urls = ((URLClassLoader) cl).getURLs();
        if (LOG.isDebugEnabled()) {
            for (URL url : urls) {
                LOG.debug(url.getFile());
            }
        }
        // _________________________________________
        String hdfsConnectionStr = configContext.getString("dataSourceConfig.hdfsConnection");
        LOG.info("HDFS connection string: " + hdfsConnectionStr);

        String hdfsPath = configContext.getString("dataSourceConfig.hdfsPath");
        LOG.info("HDFS path: " + hdfsPath);

        String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
        LOG.info("copyToPath: " + copyToPath);
        String srcPathStr = "hdfs://" + hdfsConnectionStr + hdfsPath;
        Path srcPath = new Path(srcPathStr);
        LOG.info("copyFiles: srcPath = " + srcPath);
        try {
            FileSystem fs = srcPath.getFileSystem(conf);
            /*CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); 
            CompressionCodec codec = codecFactory.getCodec(srcPath);
            DataInputStream inputStream = new DataInputStream(codec.createInputStream(fs.open(srcPath)));
            */

            Path destPath = new Path(copyToPath);
            LOG.info("Destination path: " + destPath);
            String userListFileName = configContext.getString("dataSourceConfig.userList");
            //loggerHDFSSpout.info("userListFileName: " + userListFileName);
            List<String> userList = getUser(userListFileName);
            for (String user : userList) {
                // Resolve each user's directory relative to the full HDFS source path.
                Path finalSrcPath = new Path(srcPath, user);
                fs.copyToLocalFile(finalSrcPath, destPath);
            }
            LOG.info("Copy to local succeeded");
            fs.close();

        } catch (IOException e) {
            LOG.error("Failed to copy user files from HDFS", e);
        }

    }

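    /**
     * Recursively collects the absolute paths of all .csv files under the given root directory.
     */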
    private List<String> getAllFiles(String root, int level) {

        List<String> lists = new ArrayList<String>();
        File rootFile = new File(root);
        File[] tempList = rootFile.listFiles();
        if (tempList == null)
            return lists;

        for (File temp : tempList) {
            if (temp.isDirectory())
                lists.addAll(getAllFiles(temp.getAbsolutePath(), level + 1));
            else {
                if (temp.getName().endsWith(".csv"))
                    lists.add(temp.getAbsolutePath());
            }
        }
        return lists;

    }

    public List<String> listFiles(String path) {

        LOG.info("Reading from: " + path);
        List<String> files = getAllFiles(path, 0);
        return files;
    }

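    /**
     * Loads user names, one per line, from the given classpath resource.
     */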
    private List<String> getUser(String listFileName) {
        List<String> userList = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            InputStream is = getClass().getResourceAsStream(listFileName);
            reader = new BufferedReader(new InputStreamReader(is));
            String line = "";
            while ((line = reader.readLine()) != null) {
                userList.add(line);
                LOG.info("User added:" + line);
            }
        } catch (Exception e) {
            LOG.error("Failed to read user list resource: " + listFileName, e);
        } finally {
            try {
                if (reader != null)
                    reader.close();
            } catch (IOException e) {
                LOG.warn("Failed to close user list reader", e);
            }
        }
        return userList;
    }

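    /**
     * For each configured user, reads the CSV files previously copied under
     * dataSourceConfig.copyToPath/<user>, accumulates their lines into a
     * UserProfileData object, and emits one tuple per user. The spout closes
     * itself after a single pass over the user list.
     */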
    @Override
    public void nextTuple() {
        LOG.info("nextTuple() invoked");

        String userListFileName = configContext.getString("dataSourceConfig.userList");

        //loggerHDFSSpout.info("userListFileName: " + userListFileName);
        List<String> userList = getUser(userListFileName);
        //loggerHDFSSpout.info("user list size:" + userList.size());
        for (String user : userList) {
            LOG.info("Processing user: " + user);
            String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
            //loggerHDFSSpout.info("copyToPath: " + copyToPath);

            copyToPath += "/" + user;
            List<String> files = listFiles(copyToPath);
            LOG.info("Files returned: " + files.size());
            String typeOfFile = configContext.getString("dataSourceConfig.fileFormat");
            //loggerHDFSSpout.info("typeOfFile returned: " + typeOfFile);
            UserProfileData usersProfileDataset = new UserProfileData();

            for (String fileName : files) {
                LOG.info("FileName: " + fileName);
                usersProfileDataset
                        .setDateTime(fileName.substring(fileName.lastIndexOf("/") + 1, fileName.lastIndexOf(".")));
                BufferedReader br = null;
                Reader decoder = null;
                InputStream inStream = null;

                try {
                    inStream = new FileInputStream(new File(fileName));
                    decoder = new InputStreamReader(inStream);
                    br = new BufferedReader(decoder);
                    int lineNo = 0;
                    String line = "";
                    while ((line = br.readLine()) != null) {
                        boolean containsFileHeader = configContext
                                .getBoolean("dataSourceConfig.containsFileHeader");
                        //loggerHDFSSpout.info("containsFileHeader returned: " + containsFileHeader);
                        if (containsFileHeader && lineNo == 0) {
                            // ignore the header column
                            lineNo++;
                            continue;
                        }
                        //loggerHDFSSpout.info("emitting line from file: " + fileName);

                        usersProfileDataset.setLine(line);
                        usersProfileDataset.setHrInDay(lineNo);
                        lineNo++;
                    }
                } catch (Exception e) {
                    LOG.error("File operation failed on " + fileName, e);
                    throw new IllegalStateException("Failed to read " + fileName, e);
                } finally {
                    try {
                        if (br != null)
                            br.close();
                        if (decoder != null)
                            decoder.close();
                        if (inStream != null)
                            inStream.close();
                    } catch (IOException e) {
                        LOG.warn("Failed to close input streams for " + fileName, e);
                    }
                }
            }
            usersProfileDataset.setUser(user);
            _collector.emit(new ValuesArray(user, "HDFSSourcedStormExecutor", usersProfileDataset));
            LOG.info("Emitting data of length: " + usersProfileDataset.getLines().size());
            Utils.sleep(1000);
        }
        this.close();
    }

    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        _context = context;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Declare the fields emitted in nextTuple(): partition key (user), stream name, and the UserProfileData payload.
        declarer.declare(new Fields(StreamingProcessConstants.EVENT_PARTITION_KEY,
                StreamingProcessConstants.EVENT_STREAM_NAME, StreamingProcessConstants.EVENT_ATTRIBUTE_MAP));
    }

}
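
For context, here is a minimal, hypothetical sketch (not part of the project) of how such a spout could be wired into a topology, assuming HDFSSpout ultimately implements backtype.storm.topology.IRichSpout (it overrides the spout lifecycle methods above):

import backtype.storm.LocalCluster;
import backtype.storm.topology.TopologyBuilder;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

import eagle.dataproc.impl.storm.hdfs.UserProfileGenerationHDFSSpout;

public class UserProfileTopologyExample {
    public static void main(String[] args) throws Exception {
        // Application settings carrying the dataSourceConfig.* keys shown earlier.
        Config appConfig = ConfigFactory.load();

        TopologyBuilder builder = new TopologyBuilder();
        // One spout instance; downstream bolts would subscribe to the three declared fields.
        builder.setSpout("userProfileSpout", new UserProfileGenerationHDFSSpout(appConfig), 1);

        // Run locally for illustration; a real deployment would use StormSubmitter instead.
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("user-profile-generation", new backtype.storm.Config(), builder.createTopology());
    }
}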