com.inmobi.conduit.local.LocalStreamService.java Source code

Introduction

Here is the source code for com.inmobi.conduit.local.LocalStreamService.java
Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.inmobi.conduit.local;

import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import com.inmobi.conduit.ConduitConfig;
import com.inmobi.conduit.ConduitConstants;
import com.inmobi.conduit.ConfigConstants;
import com.inmobi.conduit.SourceStream;
import com.inmobi.conduit.utils.CalendarHelper;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import com.inmobi.audit.thrift.AuditMessage;
import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.mapred.UniformSizeInputFormat;
import com.inmobi.conduit.metrics.ConduitMetrics;
import com.inmobi.conduit.AbstractService;
import com.inmobi.conduit.CheckpointProvider;
import com.inmobi.conduit.Cluster;
import com.inmobi.conduit.utils.FileUtil;

/*
 * Handles Local Streams for a Cluster
 * Assumptions
 * (i) One LocalStreamService per Cluster
 */

public class LocalStreamService extends AbstractService implements ConfigConstants {

    private static final Log LOG = LogFactory.getLog(LocalStreamService.class);

    private final Cluster srcCluster;
    private Cluster currentCluster = null;
    private Path tmpPath;
    private Path tmpJobInputPath;
    private Path tmpJobOutputPath;
    private final int FILES_TO_KEEP = 6;
    private int filesPerCollector = 10;
    private long timeoutToProcessLastCollectorFile = 60;
    private boolean processLastFile = false;
    private int numberOfFilesProcessed = 0;

    // The amount of data expected to be processed by each mapper, such that
    // each map task completes within ~20 seconds. This calculation is based
    // on assumption that the map task processing throughput is ~25 MB/s.
    protected long BYTES_PER_MAPPER = 512 * 1024 * 1024;
    private final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64);
    private DataInputBuffer in = new DataInputBuffer();

    // these paths are used to set the path of input format jar in job conf
    private final Path jarsPath;
    final Path inputFormatJarDestPath;
    final Path auditUtilJarDestPath;

    public LocalStreamService(ConduitConfig config, Cluster srcCluster, Cluster currentCluster,
            CheckpointProvider provider, Set<String> streamsToProcess) throws IOException {
        super("LocalStreamService_" + srcCluster + "_" + getServiceName(streamsToProcess), config,
                DEFAULT_RUN_INTERVAL, provider, streamsToProcess);
        this.srcCluster = srcCluster;
        if (currentCluster == null)
            this.currentCluster = srcCluster;
        else
            this.currentCluster = currentCluster;
        this.tmpPath = new Path(srcCluster.getTmpPath(), getName());
        this.tmpJobInputPath = new Path(tmpPath, "jobIn");
        this.tmpJobOutputPath = new Path(tmpPath, "jobOut");
        this.tmpCounterOutputPath = new Path(tmpPath, "counters");
        jarsPath = new Path(srcCluster.getTmpPath(), "jars");
        inputFormatJarDestPath = new Path(jarsPath, "conduit-distcp-current.jar");
        auditUtilJarDestPath = new Path(jarsPath, "messaging-client-core.jar");

        String numOfFilesPerCollector = System.getProperty(ConduitConstants.FILES_PER_COLLECETOR_PER_LOCAL_STREAM);
        if (numOfFilesPerCollector != null) {
            filesPerCollector = Integer.parseInt(numOfFilesPerCollector);
        }

        String timeoutToProcessLastFile = System
                .getProperty(ConduitConstants.TIMEOUT_TO_PROCESS_LAST_COLLECTOR_FILE);
        if (timeoutToProcessLastFile != null) {
            timeoutToProcessLastCollectorFile = Long.parseLong(timeoutToProcessLastFile);
        }

        //register metrics
        for (String eachStream : streamsToProcess) {
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), RETRY_CHECKPOINT, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), RETRY_MKDIR, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), RETRY_RENAME, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), EMPTYDIR_CREATE, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), FILES_COPIED_COUNT, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), RUNTIME, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), FAILURES, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), COMMIT_TIME, eachStream);
            ConduitMetrics.registerAbsoluteGauge(getServiceType(), LAST_FILE_PROCESSED, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_ADD_PARTITIONS_COUNT, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_CONNECTION_FAILURES, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_ALREADY_EXISTS_EXCEPTION, eachStream);
            ConduitMetrics.registerSlidingWindowGauge(getServiceType(), JOB_EXECUTION_TIME, eachStream);
        }
    }

    private void cleanUpTmp(FileSystem fs) throws Exception {
        if (fs.exists(tmpPath)) {
            LOG.info("Deleting tmpPath recursively [" + tmpPath + "]");
            fs.delete(tmpPath, true);
        }
    }

    @Override
    protected void prepareStreamHcatEnableMap() {
        Map<String, SourceStream> sourceStreamMap = config.getSourceStreams();
        for (String stream : streamsToProcess) {
            if (sourceStreamMap.containsKey(stream) && sourceStreamMap.get(stream).isHCatEnabled()) {
                streamHcatEnableMap.put(stream, true);
                Set<Path> paths = Collections.synchronizedSortedSet(new TreeSet<Path>());
                pathsToBeregisteredPerTable.put(getTableName(stream), paths);
            } else {
                streamHcatEnableMap.put(stream, false);
            }
        }
        LOG.info("Hcat enable map for local stream : " + streamHcatEnableMap);
    }

    @Override
    protected Date getTimeStampFromHCatPartition(String lastHcatPartitionLoc, String stream) {
        String streamRootDirPrefix = new Path(srcCluster.getLocalFinalDestDirRoot(), stream).toString();
        Date lastAddedPartitionDate = CalendarHelper.getDateFromStreamDir(streamRootDirPrefix,
                lastHcatPartitionLoc);
        return lastAddedPartitionDate;
    }

    protected String getTableName(String streamName) {
        if (streamTableNameMap.containsKey(streamName)) {
            return streamTableNameMap.get(streamName);
        } else {
            StringBuilder sb = new StringBuilder();
            sb.append(LOCAL_TABLE_PREFIX);
            sb.append(TABLE_NAME_SEPARATOR);
            sb.append(streamName);
            streamTableNameMap.put(streamName, sb.toString());
            return sb.toString();
        }
    }

    @Override
    public long getMSecondsTillNextRun(long currentTime) {
        return (long) (DEFAULT_RUN_INTERVAL - (long) (currentTime % DEFAULT_RUN_INTERVAL));
    }

    @Override
    protected void execute() throws Exception {
        lastProcessedFile.clear();
        List<AuditMessage> auditMsgList = new ArrayList<AuditMessage>();
        try {
            FileSystem fs = FileSystem.get(srcCluster.getHadoopConf());
            // Cleanup tmpPath before everyRun to avoid
            // any old data being used in this run if the old run was aborted
            cleanUpTmp(fs);
            LOG.info("TmpPath is [" + tmpPath + "]");
            long commitTime = srcCluster.getCommitTime();
            publishMissingPaths(fs, srcCluster.getLocalFinalDestDirRoot(), commitTime, streamsToProcess);
            Map<FileStatus, String> fileListing = new TreeMap<FileStatus, String>();
            Set<FileStatus> trashSet = new HashSet<FileStatus>();
            /* checkpointPaths table contains streamname as rowkey,
            source(collector) name as column key and checkpoint value as value */
            Table<String, String, String> checkpointPaths = HashBasedTable.create();

            long totalSize = createMRInput(tmpJobInputPath, fileListing, trashSet, checkpointPaths);

            if (fileListing.size() == 0) {
                LOG.info("Nothing to do!");
                for (String eachStream : streamsToProcess) {
                    if (lastProcessedFile.get(eachStream) != null) {
                        ConduitMetrics.updateAbsoluteGauge(getServiceType(), LAST_FILE_PROCESSED, eachStream,
                                lastProcessedFile.get(eachStream));
                    }
                }
                return;
            }
            Job job = createJob(tmpJobInputPath, totalSize);
            long jobStartTime = System.nanoTime();
            job.waitForCompletion(true);
            long jobExecutionTimeInSecs = (System.nanoTime() - jobStartTime) / (NANO_SECONDS_IN_SECOND);
            LOG.info("Time taken to complete " + job.getJobID() + " job : " + jobExecutionTimeInSecs + "secs");
            updateJobTimeCounter(jobExecutionTimeInSecs);
            if (job.isSuccessful()) {
                commitTime = srcCluster.getCommitTime();
                LOG.info("Commiting mvPaths and ConsumerPaths");

                commit(prepareForCommit(commitTime), false, auditMsgList, commitTime);
                updatePathsTobeRegisteredWithLatestDir(commitTime);
                checkPoint(checkpointPaths);
                LOG.info("Commiting trashPaths");
                commit(populateTrashCommitPaths(trashSet), true, null, commitTime);
                LOG.info("Committed successfully at " + getLogDateString(commitTime));
                for (String eachStream : streamsToProcess) {
                    if (lastProcessedFile.get(eachStream) != null) {
                        ConduitMetrics.updateAbsoluteGauge(getServiceType(), LAST_FILE_PROCESSED, eachStream,
                                lastProcessedFile.get(eachStream));
                    }
                }
            } else {
                throw new IOException("LocaStreamService job failure: Job " + job.getJobID() + " has failed. ");
            }
        } catch (Exception e) {
            LOG.warn("Error in running LocalStreamService ", e);
            throw e;
        } finally {
            publishAuditMessages(auditMsgList);
            try {
                registerPartitions();
            } catch (Exception e) {
                LOG.warn("Got exception while registering partitions. ", e);
            }
        }
    }

    private void updatePathsTobeRegisteredWithLatestDir(long commitTime) throws IOException {
        for (String eachStream : streamsToProcess) {
            if (isStreamHCatEnabled(eachStream)) {
                String path = srcCluster.getLocalDestDir(eachStream, commitTime);
                pathsToBeregisteredPerTable.get(getTableName(eachStream)).add(new Path(path));
            }
        }
    }

    private void checkPoint(Table<String, String, String> checkPointPaths) throws Exception {
        Set<String> streams = checkPointPaths.rowKeySet();
        for (String streamName : streams) {
            Map<String, String> collectorCheckpointValueMap = checkPointPaths.row(streamName);
            for (String collector : collectorCheckpointValueMap.keySet()) {
                String checkpointKey = getCheckPointKey(getClass().getSimpleName(), streamName, collector);
                LOG.debug("Check Pointing Key [" + checkpointKey + "] with value ["
                        + collectorCheckpointValueMap.get(collector) + "]");
                retriableCheckPoint(checkpointProvider, checkpointKey,
                        collectorCheckpointValueMap.get(collector).getBytes(), streamName);
            }
        }
        checkPointPaths.clear();
    }

    Map<Path, Path> prepareForCommit(long commitTime) throws Exception {
        FileSystem fs = FileSystem.get(srcCluster.getHadoopConf());

        // find final destination paths
        Map<Path, Path> mvPaths = new LinkedHashMap<Path, Path>();
        FileStatus[] categories;
        try {
            categories = fs.listStatus(tmpJobOutputPath);
        } catch (FileNotFoundException e) {
            categories = new FileStatus[0];
        }
        for (FileStatus categoryDir : categories) {
            String categoryName = categoryDir.getPath().getName();
            Path destDir = new Path(srcCluster.getLocalDestDir(categoryName, commitTime));
            FileStatus[] files;
            try {
                files = fs.listStatus(categoryDir.getPath());
            } catch (FileNotFoundException e) {
                files = new FileStatus[0];
            }
            for (FileStatus file : files) {
                Path destPath = new Path(destDir, file.getPath().getName());
                LOG.debug("Moving [" + file.getPath() + "] to [" + destPath + "]");
                mvPaths.put(file.getPath(), destPath);
            }
        }
        publishMissingPaths(fs, srcCluster.getLocalFinalDestDirRoot(), commitTime, streamsToProcess);
        return mvPaths;
    }

    Map<Path, Path> populateTrashCommitPaths(Set<FileStatus> trashSet) {
        // find trash paths
        Map<Path, Path> trashPaths = new TreeMap<Path, Path>();
        Path trash = srcCluster.getTrashPathWithDateHour();
        Iterator<FileStatus> it = trashSet.iterator();
        while (it.hasNext()) {
            FileStatus src = it.next();
            Path target = null;
            target = new Path(trash, src.getPath().getParent().getName() + "-" + src.getPath().getName());
            LOG.debug("Trashing [" + src.getPath() + "] to [" + target + "]");
            trashPaths.put(src.getPath(), target);
        }
        return trashPaths;
    }

    /*
     * Trash paths: srcPath:  hdfsUri/conduit/data/<streamname>/<collectorname>/<fileName>
     *              destPath: hdfsUri/conduit/system/trash/yyyy-MM-dd/HH/<filename>
     *
     * local stream Paths:
     *  srcPath: hdfsUri/conduit/system/tmp/<localStreamservicename>/jobout/<streamName>/<fileName>
     *  destPath: hdfsUri/conduit/streams_local/<streamName>/yyyy/MM/dd/HH/mm/<fileName>
     */
    private void commit(Map<Path, Path> commitPaths, boolean isTrashData, List<AuditMessage> auditMsgList,
            long commitTime) throws Exception {
        LOG.info("Committing " + commitPaths.size() + " paths.");
        long startTime = System.currentTimeMillis();
        FileSystem fs = FileSystem.get(srcCluster.getHadoopConf());
        if (!isTrashData) {
            for (String stream : streamsToProcess) {
                Path finalPath = new Path(srcCluster.getLocalDestDir(stream, commitTime));
                LOG.info("Creating commit time minute directory " + finalPath + " for " + stream);
                retriableMkDirs(fs, finalPath, stream);
            }
        }

        Table<String, Long, Long> parsedCounters = parseCountersFile(fs);
        for (Map.Entry<Path, Path> entry : commitPaths.entrySet()) {
            LOG.info("Renaming " + entry.getKey() + " to " + entry.getValue());
            String streamName = null;
            /*
             * finding streamname from srcPaths for committing trash paths as we don't
             * have streamname in destPath.
             * finding streamname from dest path for other paths
             */
            if (!isTrashData) {
                streamName = getTopicNameFromDestnPath(entry.getValue());
            } else {
                streamName = getCategoryFromSrcPath(entry.getKey());
            }
            retriableMkDirs(fs, entry.getValue().getParent(), streamName);
            if (retriableRename(fs, entry.getKey(), entry.getValue(), streamName) == false) {
                LOG.warn("Rename failed, aborting transaction COMMIT to avoid "
                        + "dataloss. Partial data replay could happen in next run");
                throw new Exception("Abort transaction Commit. Rename failed from [" + entry.getKey() + "] to ["
                        + entry.getValue() + "]");
            }
            if (!isTrashData) {
                String filename = entry.getKey().getName();
                generateAuditMsgs(streamName, filename, parsedCounters, auditMsgList);
                ConduitMetrics.updateSWGuage(getServiceType(), FILES_COPIED_COUNT, streamName, 1);
            }
        }
        long elapsedTime = System.currentTimeMillis() - startTime;
        LOG.debug("Committed " + commitPaths.size() + " paths.");
        for (String eachStream : streamsToProcess) {
            ConduitMetrics.updateSWGuage(getServiceType(), COMMIT_TIME, eachStream, elapsedTime);
        }
    }

    protected long createMRInput(Path inputPath, Map<FileStatus, String> fileListing, Set<FileStatus> trashSet,
            Table<String, String, String> checkpointPaths) throws IOException {
        FileSystem fs = FileSystem.get(srcCluster.getHadoopConf());

        createListing(fs, fs.getFileStatus(srcCluster.getDataDir()), fileListing, trashSet, checkpointPaths);
        // the total size of data present in all files
        long totalSize = 0;
        // if file listing is empty, simply return
        if (fileListing.isEmpty()) {
            return 0;
        }
        SequenceFile.Writer out = SequenceFile.createWriter(fs, srcCluster.getHadoopConf(), inputPath, Text.class,
                FileStatus.class);
        try {
            Iterator<Entry<FileStatus, String>> it = fileListing.entrySet().iterator();
            while (it.hasNext()) {
                Entry<FileStatus, String> entry = it.next();
                FileStatus status = FileUtil.getFileStatus(entry.getKey(), buffer, in);
                out.append(new Text(entry.getValue()), status);

                // Create a sync point after each entry. This will ensure that
                // SequenceFile
                // Reader can work at file entry level granularity, given that
                // SequenceFile
                // Reader reads from the starting of sync point.
                out.sync();

                totalSize += entry.getKey().getLen();
            }
        } finally {
            out.close();
        }

        return totalSize;
    }

    public static class CollectorPathFilter implements PathFilter {
        public boolean accept(Path path) {
            if (path.getName().endsWith("current") || path.getName().equalsIgnoreCase("scribe_stats"))
                return false;
            return true;
        }
    }

    public void createListing(FileSystem fs, FileStatus fileStatus, Map<FileStatus, String> results,
            Set<FileStatus> trashSet, Table<String, String, String> checkpointPaths) throws IOException {
        List<FileStatus> streamsFileStatus = new ArrayList<FileStatus>();
        FileSystem srcFs = FileSystem.get(srcCluster.getHadoopConf());
        for (String stream : streamsToProcess) {
            streamsFileStatus.add(srcFs.getFileStatus(new Path(srcCluster.getDataDir(), stream)));
        }
        for (FileStatus stream : streamsFileStatus) {
            String streamName = stream.getPath().getName();
            LOG.debug("createListing working on Stream [" + streamName + "]");
            FileStatus[] collectors;
            try {
                collectors = fs.listStatus(stream.getPath());
            } catch (FileNotFoundException ex) {
                collectors = new FileStatus[0];
            }
            long minOfLatestCollectorTimeStamp = -1;
            for (FileStatus collector : collectors) {
                TreeMap<String, FileStatus> collectorPaths = new TreeMap<String, FileStatus>();
                // check point for this collector
                String collectorName = collector.getPath().getName();
                String checkPointKey = getCheckPointKey(this.getClass().getSimpleName(), streamName, collectorName);

                String checkPointValue = null;
                byte[] value = checkpointProvider.read(checkPointKey);
                if (value == null) {
                    // In case checkpointKey with newer name format is absent,read old
                    // checkpoint key
                    String oldCheckPointKey = streamName + collectorName;
                    value = checkpointProvider.read(oldCheckPointKey);
                }
                if (value != null)
                    checkPointValue = new String(value);
                LOG.debug("CheckPoint Key [" + checkPointKey + "] value [ " + checkPointValue + "]");
                FileStatus[] files = null;
                try {
                    files = fs.listStatus(collector.getPath(), new CollectorPathFilter());
                } catch (FileNotFoundException e) {
                }

                if (files == null) {
                    LOG.warn("No Files Found in the Collector " + collector.getPath() + " Skipping Directory");
                    continue;
                }
                TreeSet<FileStatus> sortedFiles = new TreeSet<FileStatus>(new FileTimeStampComparator());
                String currentFile = getCurrentFile(fs, files, sortedFiles);
                LOG.debug("last file " + currentFile + " in the collector directory " + collector.getPath());

                Iterator<FileStatus> it = sortedFiles.iterator();
                numberOfFilesProcessed = 0;
                long latestCollectorFileTimeStamp = -1;
                while (it.hasNext() && numberOfFilesProcessed < filesPerCollector) {
                    FileStatus file = it.next();
                    LOG.debug("Processing " + file.getPath());
                    /*
                     * fileTimeStamp value will be -1 for the files which are already processed
                     */
                    long fileTimeStamp = processFile(file, currentFile, checkPointValue, fs, results,
                            collectorPaths, streamName);
                    if (fileTimeStamp > latestCollectorFileTimeStamp) {
                        latestCollectorFileTimeStamp = fileTimeStamp;
                    }
                }
                populateTrash(collectorPaths, trashSet);
                populateCheckpointPathForCollector(checkpointPaths, collectorPaths);

                if ((latestCollectorFileTimeStamp < minOfLatestCollectorTimeStamp
                        || minOfLatestCollectorTimeStamp == -1) && latestCollectorFileTimeStamp != -1) {
                    minOfLatestCollectorTimeStamp = latestCollectorFileTimeStamp;
                }
            } // all files in a collector
            if (minOfLatestCollectorTimeStamp != -1) {
                lastProcessedFile.put(streamName, minOfLatestCollectorTimeStamp);
            } else {
                LOG.warn("No new files in " + streamName + " stream");
            }
        }
    }

    /*
     * This getter method is only for unit tests.
     */
    public long getLastAddedPartTime(String stream) {
        return lastAddedPartitionMap.get(stream);
    }

    private long processFile(FileStatus file, String currentFile, String checkPointValue, FileSystem fs,
            Map<FileStatus, String> results, Map<String, FileStatus> collectorPaths, String stream)
            throws IOException {
        long fileTimeStamp = -1;

        String fileName = file.getPath().getName();
        if (fileName != null && (!fileName.equalsIgnoreCase(currentFile) || processLastFile)) {
            String destDir = getCategoryJobOutTmpPath(stream).toString();
            if (aboveCheckpoint(checkPointValue, fileName)) {
                results.put(file, destDir);
                fileTimeStamp = CalendarHelper.getDateFromCollectorFileName(fileName);
                /*
                 * Depending on getLen() only for incrementing the
                 * number of data files count
                 */
                if (file.getLen() != 0) {
                    numberOfFilesProcessed++;
                }
            }
            collectorPaths.put(fileName, file);
        }
        return fileTimeStamp;
    }

    /*
     * Try reading a byte from a file to declare whether it's empty or not as
     * filesize isn't a right indicator in hadoop to say whether file has data or
     * not
     */
    private boolean isEmptyFile(FileStatus fileStatus, FileSystem fs) {
        boolean retVal = false;
        FSDataInputStream in = null;
        try {
            in = fs.open(fileStatus.getPath());
            byte[] data = new byte[1];
            // try reading 1 byte
            int bytesRead = in.read(data);
            if (bytesRead == 1) {
                // not empty file
                retVal = false;
            } else {
                // not able to read 1 bytes also then empty file
                retVal = true;
            }
        } catch (IOException e) {
            LOG.error("Unable to find if file is empty or not [" + fileStatus.getPath() + "]", e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e1) {
                    LOG.error("Error in closing file [" + fileStatus.getPath() + "]", e1);
                }
            }
        }
        return retVal;
    }

    private void populateCheckpointPathForCollector(Table<String, String, String> checkpointPaths,
            TreeMap<String, FileStatus> collectorPaths) {
        // Last file in sorted ascending order to be check-pointed for this
        // collector
        if (collectorPaths != null && collectorPaths.size() > 0) {
            Entry<String, FileStatus> entry = collectorPaths.lastEntry();
            Path filePath = entry.getValue().getPath();
            String streamName = getCategoryFromSrcPath(filePath);
            String collectorName = filePath.getParent().getName();
            String checkpointPath = filePath.getName();
            checkpointPaths.put(streamName, collectorName, checkpointPath);
        }
    }

    private void populateTrash(Map<String, FileStatus> collectorPaths, Set<FileStatus> trashSet) {
        if (collectorPaths.size() <= FILES_TO_KEEP)
            return;
        else {
            // put collectorPaths.size() - FILES_TO_KEEP in trash path
            // in ascending order of creation
            Iterator<String> it = collectorPaths.keySet().iterator();
            int trashCnt = (collectorPaths.size() - FILES_TO_KEEP);
            int i = 0;
            while (it.hasNext() && i++ < trashCnt) {
                String fileName = it.next();
                trashSet.add(collectorPaths.get(fileName));
            }
        }
    }

    private boolean aboveCheckpoint(String checkPoint, String file) {
        if (checkPoint == null)
            return true;
        else if (file != null && file.compareTo(checkPoint) > 0) {
            return true;
        } else
            return false;
    }

    class FileTimeStampComparator implements Comparator<FileStatus> {
        public int compare(FileStatus file1, FileStatus file2) {
            long file1Time = file1.getModificationTime();
            long file2Time = file2.getModificationTime();
            if ((file1Time < file2Time))
                return -1;
            else
                return 1;

        }
    }

    /*
     * @returns null: if there are no files
     */
    protected String getCurrentFile(FileSystem fs, FileStatus[] files, TreeSet<FileStatus> sortedFiles) {
        // Proposed Algo :-> Sort files based on timestamp
        // if there are no files)
        // then null (implying process this file as non-current file)
        // else
        // return last file as the current file

        if (files == null || files.length == 0)
            return null;
        for (FileStatus file : files) {
            sortedFiles.add(file);
        }

        // get last file from set
        FileStatus lastFile = sortedFiles.last();
        long diff = (System.currentTimeMillis() - lastFile.getModificationTime()) / MILLISECONDS_IN_MINUTE;
        if (diff > timeoutToProcessLastCollectorFile) {
            processLastFile = true;
        } else {
            processLastFile = false;
        }
        return lastFile.getPath().getName();
    }

    private String getCategoryFromSrcPath(Path src) {
        return src.getParent().getParent().getName();
    }

    private Path getCategoryJobOutTmpPath(String category) {
        return new Path(tmpJobOutputPath, category);
    }

    protected void setBytesPerMapper(long bytesPerMapper) {
        BYTES_PER_MAPPER = bytesPerMapper;
    }

    /*
     * The visiblity of method is set to protected to enable unit testing
     */
    protected Job createJob(Path inputPath, long totalSize) throws IOException {
        String jobName = getName();
        Configuration conf = currentCluster.getHadoopConf();
        conf.set(ConduitConstants.AUDIT_ENABLED_KEY, System.getProperty(ConduitConstants.AUDIT_ENABLED_KEY));
        Job job = new Job(conf);
        job.setJobName(jobName);
        // DistributedCache.addFileToClassPath(inputFormatJarDestPath,
        // job.getConfiguration());
        job.getConfiguration().set("tmpjars",
                inputFormatJarDestPath.toString() + "," + auditUtilJarDestPath.toString());
        LOG.debug("Adding file [" + inputFormatJarDestPath + "] to distributed cache");
        job.setInputFormatClass(UniformSizeInputFormat.class);
        Class<? extends Mapper<Text, FileStatus, NullWritable, Text>> mapperClass = getMapperClass();
        job.setJarByClass(mapperClass);

        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // setting identity reducer
        job.setReducerClass(Reducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, tmpCounterOutputPath);
        job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
        job.getConfiguration().set(LOCALSTREAM_TMP_PATH, tmpPath.toString());
        job.getConfiguration().set(SRC_FS_DEFAULT_NAME_KEY, srcCluster.getHadoopConf().get(FS_DEFAULT_NAME_KEY));

        // set configurations needed for UniformSizeInputFormat
        int numMaps = getNumMapsForJob(totalSize);
        job.getConfiguration().setInt(DistCpConstants.CONF_LABEL_NUM_MAPS, numMaps);
        job.getConfiguration().setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalSize);
        job.getConfiguration().set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, inputPath.toString());
        LOG.info("Expected number of maps [" + numMaps + "] Total data size [" + totalSize + "]");

        return job;
    }

    private int getNumMapsForJob(long totalSize) {
        String mbPerMapper = System.getProperty(ConduitConstants.MB_PER_MAPPER);
        if (mbPerMapper != null) {
            BYTES_PER_MAPPER = Long.parseLong(mbPerMapper) * 1024 * 1024;
        }
        int numMaps = (int) Math.ceil(totalSize * 1.0 / BYTES_PER_MAPPER);
        if (numMaps == 0) {
            LOG.warn("number of maps were evaluated to zero. Making it as one ");
            numMaps = 1;
        }
        return numMaps;
    }

    /*
     * The visiblity of method is set to protected to enable unit testing
     */
    @SuppressWarnings("unchecked")
    protected Class<? extends Mapper<Text, FileStatus, NullWritable, Text>> getMapperClass() {
        String className = srcCluster.getCopyMapperImpl();
        if (className == null || className.isEmpty()) {
            return CopyMapper.class;
        } else {
            try {
                return (Class<? extends Mapper<Text, FileStatus, NullWritable, Text>>) Class.forName(className);
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Copy mapper Impl " + className + "is not found in class path");
            }
        }
    }

    public Cluster getCurrentCluster() {
        return currentCluster;
    }

    protected String getTier() {
        return "LOCAL";
    }

    /*
     * Find the topic name from path of format
     * /conduit/streams_local/<streamName>/2013/10/
     * 01/09/17/<collectorName>-<streamName>-2013-10-01-09-13_00000.gz
     */
    public String getTopicNameFromDestnPath(Path destnPath) {
        String destnPathAsString = destnPath.toString();
        String destnDirAsString = new Path(srcCluster.getLocalFinalDestDirRoot()).toString();
        String pathWithoutRoot = destnPathAsString.substring(destnDirAsString.length());
        Path tmpPath = new Path(pathWithoutRoot);
        while (tmpPath.depth() != 1)
            tmpPath = tmpPath.getParent();
        return tmpPath.getName();
    }

    @Override
    public String getServiceType() {
        return "LocalStreamService";
    }

    protected Path getFinalPath(long time, String stream) {
        Path finalDestPath = null;
        try {
            finalDestPath = new Path(srcCluster.getLocalDestDir(stream, time));
        } catch (IOException e) {
            LOG.error("Got exception while constructing a path from time ", e);
        }
        return finalDestPath;
    }

}