com.inmobi.databus.purge.DataPurgerService.java Source code

Java tutorial

Introduction

Here is the source code for com.inmobi.databus.purge.DataPurgerService.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.inmobi.databus.purge;

import java.io.FileNotFoundException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.inmobi.databus.AbstractService;
import com.inmobi.databus.Cluster;
import com.inmobi.databus.DatabusConfig;
import com.inmobi.databus.DatabusConfigParser;
import com.inmobi.databus.DestinationStream;
import com.inmobi.databus.SourceStream;
import com.inmobi.databus.utils.CalendarHelper;

/*
 * Assumptions
 * (i) One data Purger Service for a cluster
 */

/**
 * Service that removes expired data from a single cluster's file system.
 *
 * <p>On every run ({@link #execute()}) it:
 * <ol>
 * <li>builds a map of stream name to retention period (in hours) from the source streams
 * produced at this cluster and the destination streams configured for it;</li>
 * <li>walks the {@code year/month/day/hour} directory tree below every stream found under the
 * cluster's merged and local roots, and collects hour directories that are at least as old as
 * the stream's retention period, plus year/month/day directories that have no children;</li>
 * <li>walks the {@code date/hour} directories below the cluster's trash path and collects
 * those older than the default trash retention;</li>
 * <li>recursively deletes every collected path.</li>
 * </ol>
 *
 * <p>The run interval handed to {@link AbstractService} is one hour (60000 * 60 ms).
 */
public class DataPurgerService extends AbstractService {
    private static final Log LOG = LogFactory.getLog(DataPurgerService.class);
    private static final long MILLISECONDS_PER_HOUR = 60L * 60L * 1000L;

    private final Cluster cluster;
    private final FileSystem fs;
    private final Integer defaulttrashPathRetentioninHours;
    private final Integer defaultstreamPathRetentioninHours;
    // Both collections are rebuilt from scratch at the start of every execute() run; they are
    // initialised here only so that getRetentionPeriod() is usable before the first run.
    private Map<String, Integer> streamRetention = new HashMap<String, Integer>();
    private Set<Path> streamsToPurge = new TreeSet<Path>();
    // Only used to render timestamps in debug log messages.
    private final DateFormat dateFormat = new SimpleDateFormat("yyyy:MM:dd:HH:mm");

    /**
     * Creates the purger for {@code cluster}.
     *
     * @param databusConfig configuration; its defaults must contain numeric values for
     *        {@link DatabusConfigParser#TRASH_RETENTION_IN_HOURS} and
     *        {@link DatabusConfigParser#RETENTION_IN_HOURS}
     * @param cluster the cluster whose file system is purged
     * @throws Exception if the file system cannot be obtained or a default retention value is
     *         missing or not a number
     */
    public DataPurgerService(DatabusConfig databusConfig, Cluster cluster) throws Exception {
        super(DataPurgerService.class.getName(), databusConfig, 60000 * 60);
        this.cluster = cluster;
        fs = FileSystem.get(cluster.getHadoopConf());
        this.defaulttrashPathRetentioninHours = Integer
                .valueOf(databusConfig.getDefaults().get(DatabusConfigParser.TRASH_RETENTION_IN_HOURS));
        this.defaultstreamPathRetentioninHours = Integer
                .valueOf(databusConfig.getDefaults().get(DatabusConfigParser.RETENTION_IN_HOURS));
    }

    /**
     * Marks the service as stopped and interrupts its worker thread.
     */
    @Override
    public void stop() {
        stopped = true;
        /*
         * The purger can sleep for an hour between runs, so it needs to be interrupted.
         */
        if (thread != null) {
            thread.interrupt();
        }
        LOG.info(Thread.currentThread().getName() + " stopped [" + stopped + "]");
    }

    /**
     * @param currentTime ignored
     * @return the fixed run interval in milliseconds
     */
    @Override
    public long getMSecondsTillNextRun(long currentTime) {
        return runIntervalInMsec;
    }

    /**
     * Adds the retention period of every destination (merged) stream of this cluster to
     * {@link #streamRetention}. When a stream is already present (it is also produced locally),
     * the larger of the two retention periods wins.
     */
    private void addMergedStreams() {
        Map<String, DestinationStream> destinationStreams = cluster.getDestinationStreams();
        for (Map.Entry<String, DestinationStream> entry : destinationStreams.entrySet()) {
            String streamName = entry.getKey();
            Integer mergedStreamRetentionInHours = entry.getValue().getRetentionInHours();
            LOG.debug("Merged Stream :: streamName [" + streamName + "] mergedStreamRetentionInHours ["
                    + mergedStreamRetentionInHours + "]");

            Integer partialStreamRetentionInHours = streamRetention.get(streamName);
            if (partialStreamRetentionInHours == null) {
                streamRetention.put(streamName, mergedStreamRetentionInHours);
                LOG.debug("Adding Merged Stream [" + streamName + "] retentionInHours ["
                        + mergedStreamRetentionInHours + "]");
                continue;
            }

            // Partial & Merged stream are produced at this cluster: choose max retention period.
            // A missing merged retention must not override a configured partial one.
            Integer effectiveRetentionInHours;
            if (mergedStreamRetentionInHours == null
                    || partialStreamRetentionInHours.compareTo(mergedStreamRetentionInHours) > 0) {
                effectiveRetentionInHours = partialStreamRetentionInHours;
            } else {
                effectiveRetentionInHours = mergedStreamRetentionInHours;
            }
            streamRetention.put(streamName, effectiveRetentionInHours);
            LOG.debug("Overriding Stream [" + streamName + "] retentionInHours [" + effectiveRetentionInHours
                    + "]");
        }
    }

    /**
     * Adds the retention period of every source stream that lists this cluster among its source
     * clusters to {@link #streamRetention}.
     */
    private void addLocalStreams() {
        for (SourceStream s : getConfig().getSourceStreams().values()) {
            if (s.getSourceClusters().contains(cluster.getName())) {
                String streamName = s.getName();
                Integer retentionInHours = Integer.valueOf(s.getRetentionInHours(cluster.getName()));
                streamRetention.put(streamName, retentionInHours);
                LOG.debug("Adding Partial Stream [" + streamName + "] with retentionPeriod [" + retentionInHours
                        + " Hours]");
            }
        }
    }

    private Integer getDefaultStreamRetentionInHours() {
        return defaultstreamPathRetentioninHours;
    }

    private Integer getTrashPathRetentionInHours() {
        return defaulttrashPathRetentioninHours;
    }

    /**
     * @param streamName name of the stream
     * @return the retention period in hours recorded for the stream during the current run, or
     *         the configured default stream retention when none is recorded
     */
    Integer getRetentionPeriod(String streamName) {
        Integer retentionInHours = streamRetention.get(streamName);
        if (retentionInHours == null) {
            return getDefaultStreamRetentionInHours();
        }
        return retentionInHours;
    }

    /**
     * Runs one purge cycle: rebuilds the retention map, collects the paths to purge from the
     * merged root, the local root and the trash path, then deletes them.
     *
     * @throws Exception any failure of the cycle, logged with its stack trace and rethrown
     */
    @Override
    protected void execute() throws Exception {
        try {
            streamRetention = new HashMap<String, Integer>();
            streamsToPurge = new TreeSet<Path>();

            // populates - streamRetention
            // Map of streams and their retention period at this cluster (Partial +
            // Merged)
            // Partial streams produced at this cluster - retention period config
            addLocalStreams();
            // Merged streams at this cluster - retention period config
            addMergedStreams();

            String mergedStreamRoot = cluster.getFinalDestDirRoot();
            Map<String, Path> mergedStreamsInClusterPathMap = getStreamsInCluster(mergedStreamRoot);
            String localStreamRoot = cluster.getLocalFinalDestDirRoot();
            Map<String, Path> localStreamsInClusterPathMap = getStreamsInCluster(localStreamRoot);

            getPathsToPurge(mergedStreamsInClusterPathMap, localStreamsInClusterPathMap);
            purge();
        } catch (Exception e) {
            // Log through the logger (with stack trace) instead of stderr, and keep the
            // original exception type for the caller.
            LOG.warn("Data purger run failed", e);
            throw e;
        }
    }

    /**
     * Fills {@link #streamsToPurge} from the merged streams, the local streams and the trash.
     */
    private void getPathsToPurge(Map<String, Path> mergedStreamsInClusterPathMap,
            Map<String, Path> localStreamsInClusterPathMap) throws Exception {
        getStreamsPathToPurge(mergedStreamsInClusterPathMap);
        getStreamsPathToPurge(localStreamsInClusterPathMap);
        getTrashPathsToPurge();
    }

    /**
     * Collects the {@code date/hour} directories below the cluster's trash path that are older
     * than the trash retention. Entries whose date or hour cannot be parsed are collected as
     * well, since their age can never be established.
     */
    private void getTrashPathsToPurge() throws Exception {
        Path trashRoot = cluster.getTrashPath();
        LOG.debug("Looking for trashPaths in [" + trashRoot + "]");
        FileStatus[] trashDatePaths = getAllFilesInDir(trashRoot, fs);
        if (trashDatePaths == null) {
            return;
        }
        for (FileStatus trashPath : trashDatePaths) {
            FileStatus[] trashHourPaths = getAllFilesInDir(trashPath.getPath(), fs);
            if (trashHourPaths == null) {
                continue;
            }
            String trashDateName = trashPath.getPath().getName();
            for (FileStatus trashHourPath : trashHourPaths) {
                Path hourPath = trashHourPath.getPath();
                boolean expired;
                try {
                    Calendar trashPathDate = getDateFromTrashPath(trashDateName, hourPath.getName());
                    expired = isPurge(trashPathDate, getTrashPathRetentionInHours());
                } catch (NumberFormatException e) {
                    // Unparsable date/hour directory name: purge it.
                    expired = true;
                }
                if (expired) {
                    streamsToPurge.add(hourPath.makeQualified(fs));
                }
            }
        }
    }

    /**
     * Builds the timestamp of a trash hour directory.
     *
     * @param trashDatePath name of the date directory, e.g. {@code 2012-1-9}
     * @param trashHourPath name of the hour directory below it
     * @return the corresponding date and hour
     * @throws NumberFormatException if {@code trashDatePath} does not have at least three
     *         {@code -}-separated fields (and, presumably, when CalendarHelper cannot parse a
     *         field -- the caller treats both the same way)
     */
    private Calendar getDateFromTrashPath(String trashDatePath, String trashHourPath) {
        // Eg: TrashPath :: 2012-1-9
        String[] date = trashDatePath.split("-");
        if (date.length < 3) {
            // Report a malformed name the same way as a non-numeric one, so that one stray
            // directory in the trash cannot abort the whole purge run.
            throw new NumberFormatException(
                    "Trash date directory [" + trashDatePath + "] is not of the form year-month-day");
        }
        return CalendarHelper.getDateHour(date[0], date[1], date[2], trashHourPath);
    }

    /**
     * Lists the immediate children of {@code root}.
     *
     * @param root directory that contains one sub-directory per stream
     * @return map of child name (the stream name) to its fully qualified path; empty when the
     *         directory cannot be listed
     */
    private Map<String, Path> getStreamsInCluster(String root) throws Exception {
        Map<String, Path> streams = new HashMap<String, Path>();
        LOG.debug("Find streams in [" + root + "]");
        FileStatus[] paths = getAllFilesInDir(new Path(root), fs);
        if (paths == null) {
            LOG.debug("No streams found in [" + root + "]");
            return streams;
        }
        for (FileStatus fileStatus : paths) {
            streams.put(fileStatus.getPath().getName(), fileStatus.getPath().makeQualified(fs));
            LOG.debug("Purger working for stream [" + fileStatus.getPath() + "]");
        }
        return streams;
    }

    /**
     * Walks {@code streamRoot/year/month/day/hour} for every given stream and collects:
     * hour directories that {@link #isPurge(Calendar, Integer)} reports as expired, and any
     * year, month or day directory that has no children.
     *
     * @param streamPathMap stream name to stream root path
     */
    private void getStreamsPathToPurge(Map<String, Path> streamPathMap) throws Exception {
        for (Map.Entry<String, Path> entry : streamPathMap.entrySet()) {
            String streamName = entry.getKey();
            Path streamRootPath = entry.getValue();
            LOG.debug(
                    "Find Paths to purge for stream [" + streamName + "] streamRootPath [" + streamRootPath + "]");
            // The retention map is not modified while walking, so look it up once per stream.
            Integer retentionInHours = getRetentionPeriod(streamName);

            // For each Stream, all years
            FileStatus[] years = getAllFilesInDir(streamRootPath, fs);
            if (years == null) {
                continue;
            }
            for (FileStatus year : years) {
                FileStatus[] months = getAllFilesInDir(year.getPath(), fs);
                if (!hasEntries(months)) {
                    // no months found in year. Purge Year.
                    streamsToPurge.add(year.getPath().makeQualified(fs));
                    continue;
                }
                for (FileStatus month : months) {
                    FileStatus[] days = getAllFilesInDir(month.getPath(), fs);
                    if (!hasEntries(days)) {
                        // No day found in month. Purge month
                        streamsToPurge.add(month.getPath().makeQualified(fs));
                        continue;
                    }
                    for (FileStatus day : days) {
                        FileStatus[] hours = getAllFilesInDir(day.getPath(), fs);
                        if (!hasEntries(hours)) {
                            // No hour found in day. Purge day
                            streamsToPurge.add(day.getPath().makeQualified(fs));
                            continue;
                        }
                        for (FileStatus hour : hours) {
                            LOG.debug("Working for hour [" + hour.getPath() + "]");
                            Calendar streamDate = CalendarHelper.getDateHour(year.getPath().getName(),
                                    month.getPath().getName(), day.getPath().getName(),
                                    hour.getPath().getName());
                            LOG.debug("Validate [" + streamDate.toString() + "] against retentionHours ["
                                    + retentionInHours + "]");
                            if (isPurge(streamDate, retentionInHours)) {
                                LOG.debug("Adding stream to purge [" + hour.getPath() + "]");
                                streamsToPurge.add(hour.getPath().makeQualified(fs));
                            }
                        } // each hour
                    } // each day
                } // each month
            } // each year
        } // each stream
    }

    /** @return true when the listing is non-null and contains at least one entry */
    private static boolean hasEntries(FileStatus[] entries) {
        return entries != null && entries.length >= 1;
    }

    /**
     * Decides whether data stamped {@code streamDate} has outlived its retention.
     *
     * <p>The distance to the current time is taken as an absolute value, so a timestamp that
     * lies in the future by at least the retention period is reported as purgeable too.
     *
     * @param streamDate timestamp of the data
     * @param retentionPeriodinHours retention in hours; must not be null
     * @return true when the number of whole hours between {@code streamDate} and now is greater
     *         than or equal to the retention period
     */
    public boolean isPurge(Calendar streamDate, Integer retentionPeriodinHours) {
        Calendar nowTime = CalendarHelper.getNowTime();
        int hoursBetween = getHoursBetweenDates(streamDate, nowTime);

        if (LOG.isDebugEnabled()) {
            String streamDateStr = dateFormat.format(streamDate.getTime());
            String nowTimeStr = dateFormat.format(nowTime.getTime());
            LOG.debug("streamDate [" + streamDateStr + "] currentDate : [" + nowTimeStr + "] against retention ["
                    + retentionPeriodinHours + " Hours]");
            LOG.debug("Hours between streamDate and nowTime is [" + hoursBetween + " Hours]");
        }
        return hoursBetween >= retentionPeriodinHours;
    }

    /**
     * @return the absolute number of whole hours between the two instants, capped at
     *         {@link Integer#MAX_VALUE}
     */
    private int getHoursBetweenDates(Calendar startDate, Calendar endDate) {
        long diff = endDate.getTimeInMillis() - startDate.getTimeInMillis();
        long hours = Math.abs(diff / MILLISECONDS_PER_HOUR);
        return (int) Math.min(hours, Integer.MAX_VALUE);
    }

    /**
     * Recursively deletes every path collected in {@link #streamsToPurge}.
     */
    private void purge() throws Exception {
        for (Path purgePath : streamsToPurge) {
            LOG.info("Purging [" + purgePath + "]");
            if (!fs.delete(purgePath, true)) {
                LOG.warn("Could not purge [" + purgePath + "]");
            }
        }
    }

    /**
     * Lists the children of {@code dir}.
     *
     * @return the listing, or null when the directory does not exist. Depending on the Hadoop
     *         version a missing path is reported either as a null result or as a
     *         FileNotFoundException; both are normalised to null because every caller checks
     *         for null.
     */
    private FileStatus[] getAllFilesInDir(Path dir, FileSystem fs) throws Exception {
        try {
            return fs.listStatus(dir);
        } catch (FileNotFoundException e) {
            LOG.debug("Path [" + dir + "] does not exist");
            return null;
        }
    }
}