com.inmobi.databus.readers.DatabusStreamWaitingReader.java Source code

Java tutorial

Introduction

Here is the source code for com.inmobi.databus.readers.DatabusStreamWaitingReader.java

Source

package com.inmobi.databus.readers;

/*
 * #%L
 * messaging-client-databus
 * %%
 * Copyright (C) 2012 - 2014 InMobi
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.IOException;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import com.inmobi.databus.files.FileMap;
import com.inmobi.databus.files.HadoopStreamFile;
import com.inmobi.databus.partition.PartitionCheckpoint;
import com.inmobi.databus.partition.PartitionCheckpointList;
import com.inmobi.databus.partition.PartitionId;
import com.inmobi.messaging.Message;
import com.inmobi.messaging.metrics.PartitionReaderStatsExposer;

public class DatabusStreamWaitingReader extends DatabusStreamReader<HadoopStreamFile> {

    private static final Log LOG = LogFactory.getLog(DatabusStreamWaitingReader.class);
    private final Set<Integer> partitionMinList;
    private final Map<Integer, CheckpointInfo> pChkpoints = new HashMap<Integer, CheckpointInfo>();
    private final Map<Integer, PartitionCheckpoint> deltaCheckpoint;
    private boolean createdDeltaCheckpointForFirstFile;
    private int currentMin;

    private static class CheckpointInfo {
        PartitionCheckpoint pck;
        Date timeStamp;
        boolean processed = false;
        boolean readFully = false;

        CheckpointInfo(PartitionCheckpoint pck) {
            this.pck = pck;
            if (pck != null) {
                this.timeStamp = getDateFromCheckpointPath(
                        ((HadoopStreamFile) pck.getStreamFile()).getCheckpointPath());
                this.readFully = (pck.getLineNum() == -1);
                // no need to process the full checkpoint and
                // checkpoint which has file name "null"
                if (readFully || (pck.getName() == null)) {
                    processed = true;
                }
            } else {
                // nothing to process
                processed = true;
            }
        }
    }

    public DatabusStreamWaitingReader(PartitionId partitionId, FileSystem fs, Path streamDir,
            String inputFormatClass, Configuration conf, long waitTimeForFileCreate,
            PartitionReaderStatsExposer metrics, boolean noNewFiles, Set<Integer> partitionMinList,
            PartitionCheckpointList partitionCheckpointList, Date stopTime) throws IOException {
        super(partitionId, fs, streamDir, inputFormatClass, conf, waitTimeForFileCreate, metrics, noNewFiles,
                stopTime);
        this.partitionMinList = partitionMinList;
        currentMin = -1;
        for (Integer min : partitionMinList) {
            CheckpointInfo cpi = new CheckpointInfo(partitionCheckpointList.getCheckpoints().get(min));
            pChkpoints.put(min, cpi);
        }
        deltaCheckpoint = new HashMap<Integer, PartitionCheckpoint>();
        createdDeltaCheckpointForFirstFile = false;
    }

    public PartitionCheckpoint getLeastCheckpoint() {
        CheckpointInfo least = null;
        Iterator<CheckpointInfo> it = pChkpoints.values().iterator();
        if (it.hasNext()) {
            least = it.next();
        }
        while (it.hasNext()) {
            CheckpointInfo tmp = it.next();
            if (tmp.timeStamp != null) {
                if (least.timeStamp == null || tmp.timeStamp.before(least.timeStamp)) {
                    least = tmp;
                }
            }
        }
        return least.pck;
    }

    /**
     * This method is used to check whether the given minute directory is
     * completely read or not. It takes the current time stamp and the minute
     * on which the reader is currently working. It retrieves the partition checkpoint
     * for that minute if it contains. It compares the current time stamp with
     * the checkpointed time stamp. If current time stamp is before the
     * checkpointed time stamp then that minute directory for the current hour is
     * completely read. If both the time stamps are same then it checks line number.
     * If line num is -1 means all the files in that minute dir are already read.
     */
    public boolean isRead(Date currentTimeStamp, int minute) {
        Date checkpointedTimestamp = pChkpoints.get(Integer.valueOf(minute)).timeStamp;
        if (checkpointedTimestamp == null) {
            return false;
        }
        if (currentTimeStamp.before(checkpointedTimestamp)) {
            return true;
        } else if (currentTimeStamp.equals(checkpointedTimestamp)) {
            return pChkpoints.get(Integer.valueOf(minute)).readFully;
        }
        return false;
    }

    @Override
    protected void buildListing(FileMap<HadoopStreamFile> fmap, PathFilter pathFilter) throws IOException {
        if (!setBuildTimeStamp(pathFilter)) {
            return;
        }
        Calendar current = Calendar.getInstance();
        Date now = current.getTime();
        current.setTime(buildTimestamp);
        while (current.getTime().before(now) && !closed) {
            // stop the file listing if stop date is beyond current time.
            if (checkAndSetstopTimeReached(current)) {
                break;
            }
            int min = current.get(Calendar.MINUTE);
            Date currenTimestamp = current.getTime();
            // Move the current minute to next minute
            current.add(Calendar.MINUTE, 1);
            if (partitionMinList.contains(Integer.valueOf(min)) && !isRead(currenTimestamp, min)) {
                Path dir = getMinuteDirPath(streamDir, currenTimestamp);
                if (fsIsPathExists(dir)) {
                    Path nextMinDir = getMinuteDirPath(streamDir, current.getTime());
                    if (fsIsPathExists(nextMinDir)) {
                        int numFilesInFileMap = fmap.getSize();
                        doRecursiveListing(dir, pathFilter, fmap);
                        if (numFilesInFileMap != 0 && numFilesInFileMap != fmap.getSize()) {
                            // stopping after listing two non empty directories
                            LOG.debug("Listing stopped after listing two non empty directories");
                            break;
                        } else if (fmap.isEmpty()) {
                            setLatestMinuteAlreadyRead(currenTimestamp);
                        }
                        updatePathBeingReadMetrics(currenTimestamp);
                    } else {
                        LOG.info("Reached end of file listing. Not looking at the last" + " minute directory:"
                                + dir);
                        break;
                    }
                }
            }
        }

        if (getFirstFileInStream() != null && (currentMin == -1)) {
            currentMin = getMinuteFromFile(getFirstFileInStream());
        }
    }

    private void updatePathBeingReadMetrics(Date currentTimeStamp) {
        if (getCurrentMin() != -1 && isUpdateRequired(currentTimeStamp)) {
            setLatestMinuteAlreadyRead(currentTimeStamp);
        }
    }

    private boolean isUpdateRequired(Date currentTimeStamp) {
        return currentTimeStamp.getTime() > getLatestMinuteAlreadyRead();
    }

    /*
     * check whether reached stopTime and stop the File listing if it reached stopTime
     */
    private boolean checkAndSetstopTimeReached(Calendar current) {
        if (stopTime != null && stopTime.before(current.getTime())) {
            LOG.info("Reached stopTime. Not listing from after the stop date ");
            stopListing();
            return true;
        }
        return false;
    }

    /**
     * This method does the required setup before moving to next file. First it
     * checks whether the both current file and next file belongs to same minute
     * or different minutes. If files exists on across minutes then it has to
     * check the next file is same as checkpointed file. If not same and checkpointed
     * file exists then sets the iterator to the checkpointed file.
     * @return false if it reads from the checkpointed file.
     * @throws InterruptedException 
     */
    @Override
    public boolean prepareMoveToNext(FileStatus currentFile, FileStatus nextFile)
            throws IOException, InterruptedException {
        Calendar next = Calendar.getInstance();
        Date nextFileTimeStamp = getDateFromStreamDir(streamDir, nextFile.getPath().getParent());
        next.setTime(nextFileTimeStamp);

        boolean readFromCheckpoint = false;
        if (currentMin != next.get(Calendar.MINUTE)) {
            if (currentFile != null) {
                Date currentFileTimeStamp = getDateFromStreamDir(streamDir, currentFile.getPath().getParent());
                setDeltaCheckpoint(getNextMinuteTimeStamp(currentFileTimeStamp), nextFileTimeStamp);
                // set the line number as -1 as current file was read fully.
                deltaCheckpoint.put(currentMin, new PartitionCheckpoint(getStreamFile(currentFile), -1));
            }
            // move to next file
            currentMin = next.get(Calendar.MINUTE);
            readFromCheckpoint = moveToCheckpoint(nextFile);
        } else {
            this.currentFile = nextFile;
        }
        setIterator();
        return !readFromCheckpoint;
    }

    private boolean moveToCheckpoint(FileStatus fileToRead) throws IOException, InterruptedException {
        boolean ret = false;
        CheckpointInfo cpi = pChkpoints.get(currentMin);
        if (!cpi.processed) {
            cpi.processed = true;
            PartitionCheckpoint partitionCheckpoint = cpi.pck;
            HadoopStreamFile sFile = (HadoopStreamFile) partitionCheckpoint.getStreamFile();
            if (sFile.getFileName() != null) {
                Path checkPointedFileName = new Path(streamDir, partitionCheckpoint.getFileName());
                //set iterator to checkpointed file if there is a checkpoint
                if (!fileToRead.getPath().equals(checkPointedFileName)) {
                    if (fsIsPathExists(checkPointedFileName)) {
                        fileToRead = fsGetFileStatus(checkPointedFileName);
                        currentLineNum = partitionCheckpoint.getLineNum();
                    } else {
                        LOG.info("Checkpointed file " + partitionCheckpoint.getFileName() + " does not exist");
                        build(getDateFromStreamDir(streamDir, fileToRead.getPath()));
                        startFromNextHigher(sFile);
                        return true;
                    }
                } else {
                    currentLineNum = partitionCheckpoint.getLineNum();
                }
                ret = true;
            }
        }
        this.currentFile = fileToRead;
        return ret;
    }

    private Date getNextMinuteTimeStamp(Date currentFileTimeStamp) {
        Calendar cal = Calendar.getInstance();
        cal.setTime(currentFileTimeStamp);
        cal.add(Calendar.MINUTE, 1);
        return cal.getTime();
    }

    /*
     * prepare a delta checkpoint
     */
    private void setDeltaCheckpoint(Date from, Date to) {
        Calendar cal = Calendar.getInstance();
        cal.setTime(from);
        while (cal.getTime().before(to)) {
            Integer currentMinute = cal.get(Calendar.MINUTE);
            if (partitionMinList.contains(currentMinute)) {
                // create a checkpoint for that minute only if it does not have
                // checkpoint or checkpoint file time stamp is older than
                // current file time stamp
                Date checkpointedTimeStamp = pChkpoints.get(currentMinute).timeStamp;
                if (checkpointedTimeStamp == null || !(checkpointedTimeStamp.after(cal.getTime()))) {
                    deltaCheckpoint.put(currentMinute, new PartitionCheckpoint(getStreamFile(cal.getTime()), -1));
                    pChkpoints.get(currentMinute).processed = true;
                }
            }
            cal.add(Calendar.MINUTE, 1);
        }
    }

    @Override
    protected HadoopStreamFile getStreamFile(Date timestamp) {
        return getHadoopStreamFile(streamDir, timestamp);
    }

    protected HadoopStreamFile getStreamFile(FileStatus status) {
        return getHadoopStreamFile(status);
    }

    protected void startFromNextHigher(HadoopStreamFile file) throws IOException, InterruptedException {
        if (!setNextHigherAndOpen(file)) {
            waitForNextFileCreation(file);
        }
    }

    private void waitForNextFileCreation(HadoopStreamFile file) throws IOException, InterruptedException {
        while (!closed && !setNextHigherAndOpen(file) && !hasReadFully()) {
            waitForFileCreate();
            build();
        }
    }

    protected void startFromNextHigher(FileStatus file) throws IOException, InterruptedException {
        if (!setNextHigherAndOpen(file)) {
            waitForNextFileCreation(file);
        }
    }

    private void waitForNextFileCreation(FileStatus file) throws IOException, InterruptedException {
        while (!closed && !setNextHigherAndOpen(file) && !hasReadFully()) {
            waitForFileCreate();
            build();
        }
    }

    @Override
    public Message readLine() throws IOException, InterruptedException {
        if (closed) {
            LOG.info("Stream closed");
            return null;
        }
        Message line = readNextLine();
        if (!createdDeltaCheckpointForFirstFile) {
            // prepare partition checkpoint for all minutes in the partitionMinList
            deltaCheckpoint.putAll(buildStartPartitionCheckpoints());
            setDeltaCheckpoint(startTimestamp == null ? buildTimestamp : startTimestamp,
                    getDateFromStreamDir(streamDir, getCurrentFile()));
            createdDeltaCheckpointForFirstFile = true;
        }
        while (line == null) { // reached end of file
            LOG.info("Read " + getCurrentFile() + " with lines:" + currentLineNum);
            if (closed) {
                LOG.info("Stream closed");
                break;
            }
            if (!nextFile()) { // reached end of file list
                LOG.info("could not find next file. Rebuilding");
                build(getDateFromStreamDir(streamDir, getCurrentFile()));

                if (!nextFile()) { // reached end of stream
                    // stop reading if read till stopTime
                    if (hasReadFully()) {
                        /* create  a delta checkpoint till stop time from last file timestamp
                         * Ex: If last file is in 2nd hr, 5th minute(02/05) and stop time is 02/10
                         * then we should have a delta checkpoint 02/05/file--1,
                         *  02/06/null--1 till 02/10/null--1
                         */
                        Date lastFileTimestamp = getDateFromStreamDir(streamDir, getCurrentFile());
                        if (stopTime != null) {
                            setDeltaCheckpoint(getNextMinuteTimeStamp(lastFileTimestamp),
                                    getNextMinuteTimeStamp(stopTime));
                            currentLineNum = -1;
                        }
                        LOG.info("read all files till stop date");
                        break;
                    }
                    LOG.info("Could not find next file");
                    startFromNextHigher(currentFile);
                    LOG.info("Reading from next higher file " + getCurrentFile());
                } else {
                    LOG.info("Reading from " + getCurrentFile() + " after rebuild");
                }
            } else {
                // read line from next file
                LOG.info("Reading from next file " + getCurrentFile());
            }
            line = readNextLine();
        }
        return line;
    }

    @Override
    protected FileMap<HadoopStreamFile> createFileMap() throws IOException {
        return new FileMap<HadoopStreamFile>() {
            @Override
            protected void buildList() throws IOException {
                buildListing(this, pathFilter);
            }

            @Override
            protected TreeMap<HadoopStreamFile, FileStatus> createFilesMap() {
                return new TreeMap<HadoopStreamFile, FileStatus>();
            }

            @Override
            protected HadoopStreamFile getStreamFile(String fileName) {
                throw new RuntimeException("Not implemented");
            }

            @Override
            protected HadoopStreamFile getStreamFile(FileStatus file) {
                return HadoopStreamFile.create(file);
            }

            @Override
            protected PathFilter createPathFilter() {
                return new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        if (path.getName().startsWith("_")) {
                            return false;
                        }
                        return true;
                    }
                };
            }
        };
    }

    public static Date getBuildTimestamp(Path streamDir, PartitionCheckpoint partitionCheckpoint) {
        return getDateFromCheckpointPath(partitionCheckpoint.getFileName());
    }

    public static HadoopStreamFile getHadoopStreamFile(FileStatus status) {
        return HadoopStreamFile.create(status);
    }

    public static HadoopStreamFile getHadoopStreamFile(Path streamDirPath, Date timestamp) {
        return new HadoopStreamFile(getMinuteDirPath(streamDirPath, timestamp), null, null);
    }

    public Map<Integer, PartitionCheckpoint> getDeltaCheckpoint() {
        return deltaCheckpoint;
    }

    public int getCurrentMin() {
        return this.currentMin;
    }

    /**
     * @returns Zero  if checkpoint is not present for that minute or
     *                checkpoint file and current file were not same.
     *          Line number from checkpoint
     */
    @Override
    protected long getLineNumberForFirstFile(FileStatus firstFile) {
        int minute = getMinuteFromFile(firstFile);
        PartitionCheckpoint partitionChkPoint = pChkpoints.get(Integer.valueOf(minute)).pck;
        if (partitionChkPoint != null) {
            Path checkPointedFileName = new Path(streamDir, partitionChkPoint.getFileName());
            // check whether current file and checkpoint file are same
            if (checkPointedFileName.equals(firstFile.getPath())) {
                return partitionChkPoint.getLineNum();
            }
        }
        return 0;
    }

    private int getMinuteFromFile(FileStatus firstFile) {
        Date currentTimeStamp = getDateFromStreamDir(streamDir, firstFile.getPath().getParent());
        Calendar cal = Calendar.getInstance();
        cal.setTime(currentTimeStamp);
        return cal.get(Calendar.MINUTE);
    }

    public void resetDeltaCheckpoint() {
        deltaCheckpoint.clear();
    }

    public Map<Integer, PartitionCheckpoint> buildStartPartitionCheckpoints() {
        Map<Integer, PartitionCheckpoint> fullPartitionChkMap = new HashMap<Integer, PartitionCheckpoint>();
        if (buildTimestamp == null) {
            return fullPartitionChkMap;
        }
        Calendar cal = Calendar.getInstance();
        cal.setTime(buildTimestamp);
        cal.add(Calendar.MINUTE, 60);
        Date to = cal.getTime();
        cal.setTime(buildTimestamp);
        while (cal.getTime().before(to)) {
            Integer currentMinute = cal.get(Calendar.MINUTE);
            if (partitionMinList.contains(currentMinute)) {
                Date checkpointedTimeStamp = pChkpoints.get(currentMinute).timeStamp;
                // create a checkpoint for that minute only if it does not have
                // checkpoint so that for no minute the checkpoint is null
                if (checkpointedTimeStamp == null) {
                    fullPartitionChkMap.put(currentMinute,
                            new PartitionCheckpoint(getStreamFile(cal.getTime()), 0));
                    pChkpoints.get(currentMinute).processed = true;
                }
            }
            cal.add(Calendar.MINUTE, 1);
        }
        return fullPartitionChkMap;
    }

    public void startFromCheckPoint() throws IOException, InterruptedException {
        initCurrentFile();
        moveToCheckpoint(getFirstFileInStream());
        if (currentFile != null) {
            LOG.debug("CurrentFile:" + getCurrentFile() + " currentLineNum:" + currentLineNum);
            setIterator();
        }
    }

    /*
     * This method is for getting the time stamp from databus collector stream file.
     */
    @Override
    protected Date getTimeStampFromCollectorStreamFile(FileStatus file) {
        throw new UnsupportedOperationException("Can not be supported ");
    }
}