com.inmobi.conduit.purge.DataPurgerService.java Source code

Introduction

Here is the source code for com.inmobi.conduit.purge.DataPurgerService.java. DataPurgerService is the Conduit service that enforces data retention on a cluster: it computes per-stream retention periods from the local and merged stream configuration, scans the merged, local, and trash directory trees for paths older than their retention window, and deletes them, dropping the corresponding HCatalog partitions first when HCat is enabled.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.inmobi.conduit.purge;

import java.io.FileNotFoundException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import com.inmobi.conduit.utils.CalendarHelper;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.thrift.TException;

import com.inmobi.conduit.metrics.ConduitMetrics;
import com.inmobi.conduit.AbstractService;
import com.inmobi.conduit.Cluster;
import com.inmobi.conduit.Conduit;
import com.inmobi.conduit.ConduitConfig;
import com.inmobi.conduit.ConduitConfigParser;
import com.inmobi.conduit.DestinationStream;
import com.inmobi.conduit.SourceStream;

/*
 * Assumptions
 * (i) One DataPurgerService runs per cluster
 */

/*
 * Set<HCatPartitionDesc> set = new TreeSet<>();
 */
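/**
 * Purges expired data on a single cluster. On each run the service builds a
 * per-stream retention map from the local and merged stream configuration,
 * walks the merged, local and trash directory trees, collects every path that
 * is older than its retention window and deletes it, dropping the matching
 * HCatalog partitions first when HCat is enabled for the stream.
 */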
public class DataPurgerService extends AbstractService {
    private static final Log LOG = LogFactory.getLog(DataPurgerService.class);

    private final Cluster cluster;
    private final FileSystem fs;
    private final Integer defaulttrashPathRetentioninHours;
    private final Integer defaultstreamPathRetentioninHours;
    private Map<String, Integer> streamRetention;
    private Set<Path> streamsToPurge;
    private Map<Path, PartitionDesc> pathPartitionDescMap;
    private DateFormat dateFormat = new SimpleDateFormat("yyyy:MM:dd:HH:mm");
    private static final long MILLISECONDS_PER_HOUR = 60 * 60 * 1000L;
    private final static String PURGEPATHS_COUNT = "purgePaths.count";
    private final static String DELETE_FAILURES_COUNT = "deleteFailures.count";
    private final static String HCAT_PURGE_PARTITION_FAILURES_COUNT = "hcat.paritition.drop.failures.count";
    private static final String HCAT_PURGED_PARTITION_COUNT = "hcat.partition.purged.count";

    public DataPurgerService(ConduitConfig conduitConfig, Cluster cluster) throws Exception {
        super(DataPurgerService.class.getName(), conduitConfig, 60000 * 60, null, new HashSet<String>());
        this.cluster = cluster;
        fs = FileSystem.get(cluster.getHadoopConf());
        this.defaulttrashPathRetentioninHours = Integer.valueOf(
                conduitConfig.getDefaults().get(ConduitConfigParser.TRASH_RETENTION_IN_HOURS));
        this.defaultstreamPathRetentioninHours = Integer.valueOf(
                conduitConfig.getDefaults().get(ConduitConfigParser.RETENTION_IN_HOURS));
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), PURGEPATHS_COUNT, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), DELETE_FAILURES_COUNT, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), RUNTIME, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), FAILURES, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_PURGE_PARTITION_FAILURES_COUNT, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_CONNECTION_FAILURES, getName());
        ConduitMetrics.registerSlidingWindowGauge(getServiceType(), HCAT_PURGED_PARTITION_COUNT, getName());

        if (Conduit.isHCatEnabled()) {
            Map<String, DestinationStream> destMap = cluster.getDestinationStreams();
            for (Map.Entry<String, DestinationStream> entry : destMap.entrySet()) {
                streamHcatEnableMap.put(entry.getKey(), entry.getValue().isHCatEnabled());
            }
            Map<String, SourceStream> sourceStreams = config.getSourceStreams();
            for (Map.Entry<String, SourceStream> entry : sourceStreams.entrySet()) {
                streamHcatEnableMap.put(entry.getKey(), entry.getValue().isHCatEnabled());
            }
        }
    }

    private class PartitionDesc {
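        // Captures the HCatalog metadata needed to drop the partition that backs a
        // purge path: stream, table, partition location, partition spec, and the
        // filter string derived from that spec.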
        String streamName;
        String tableName;
        String location;
        Map<String, String> partSpec;
        String filterString;

        PartitionDesc(String stream, String table, String pathLocation, Map<String, String> partitionSpec,
                String filter) {
            streamName = stream;
            tableName = table;
            location = pathLocation;
            partSpec = partitionSpec;
            filterString = filter;
        }

        public String getLocation() {
            return location;
        }

        public String getFilterString() {
            return filterString;
        }

        public String getStreamName() {
            return streamName;
        }

        public String getTableName() {
            return tableName;
        }

        public Map<String, String> getPartSpec() {
            return partSpec;
        }
    }

    @Override
    public void stop() {
        stopped = true;
        /*
         * The purger can sleep for an hour, so its thread needs to be interrupted
         */
        thread.interrupt();
        LOG.info(Thread.currentThread().getName() + " stopped [" + stopped + "]");
    }

    @Override
    public long getMSecondsTillNextRun(long currentTime) {
        return runIntervalInMsec;
    }

    private void addMergedStreams() {
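        // Record the retention period of every merged (destination) stream on this
        // cluster. If a local stream with the same name was already registered by
        // addLocalStreams(), the longer retention period wins.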
        for (Map.Entry<String, DestinationStream> entry : cluster.getDestinationStreams().entrySet()) {
            String streamName = entry.getKey();
            DestinationStream consumeStream = entry.getValue();
            Integer mergedStreamRetentionInHours = consumeStream.getRetentionInHours();

            LOG.debug("Merged Stream :: streamName [" + streamName + "] mergedStreamRetentionInHours ["
                    + mergedStreamRetentionInHours + "]");
            if (streamRetention.get(streamName) == null) {
                streamRetention.put(streamName, mergedStreamRetentionInHours);
                LOG.debug("Adding Merged Stream [" + streamName + "] retentionInHours ["
                        + mergedStreamRetentionInHours + "]");
            } else {
                // Partial & Merged stream are produced at this cluster
                // choose max retention period
                Integer partialStreamRetentionInHours = streamRetention.get(streamName);
                if (partialStreamRetentionInHours.compareTo(mergedStreamRetentionInHours) > 0) {
                    streamRetention.put(streamName, partialStreamRetentionInHours);
                    LOG.debug("Overriding Stream [" + streamName + "] retentionInHours ["
                            + partialStreamRetentionInHours + "]");

                } else {
                    streamRetention.put(streamName, mergedStreamRetentionInHours);
                    LOG.debug("Overriding Stream [" + streamName + "] retentionInHours ["
                            + mergedStreamRetentionInHours + "]");

                }

            }
        }
    }

    private void addLocalStreams() {
        for (SourceStream s : getConfig().getSourceStreams().values()) {
            if (s.getSourceClusters().contains(cluster.getName())) {
                String streamName = s.getName();
                Integer retentionInHours = Integer.valueOf(s.getRetentionInHours(cluster.getName()));
                streamRetention.put(streamName, retentionInHours);
                LOG.debug("Adding Partial Stream [" + streamName + "] with retentionPeriod [" + retentionInHours
                        + " Hours]");
            }
        }
    }

    private Integer getDefaultStreamRetentionInHours() {
        return defaultstreamPathRetentioninHours;
    }

    private Integer getTrashPathRetentionInHours() {
        return defaulttrashPathRetentioninHours;
    }

    Integer getRetentionPeriod(String streamName) {
        Integer retentionInHours = streamRetention.get(streamName);
        if (retentionInHours == null)
            return getDefaultStreamRetentionInHours();
        return retentionInHours;
    }

    @Override
    protected void execute() throws Exception {
        try {
            streamRetention = new HashMap<String, Integer>();
            streamsToPurge = new TreeSet<Path>();
            pathPartitionDescMap = new HashMap<Path, PartitionDesc>();

            // populates - streamRetention
            // Map of streams and their retention period at this cluster (Partial +
            // Merged)
            // Partial streams produced at this cluster - retention period config
            addLocalStreams();
            // Merged streams at this cluster - retention period config
            addMergedStreams();
            String mergedStreamRoot = cluster.getFinalDestDirRoot();
            Map<String, Path> mergedStreamsInClusterPathMap = getStreamsInCluster(mergedStreamRoot);
            String localStreamRoot = cluster.getLocalFinalDestDirRoot();
            Map<String, Path> localStreamsInClusterPathMap = getStreamsInCluster(localStreamRoot);
            getPathsToPurge(mergedStreamsInClusterPathMap, localStreamsInClusterPathMap);
            purge();
        } catch (Exception e) {
            LOG.warn("Purger run failed", e);
            throw new Exception(e);
        }
    }

    public void clearStreamHCatEnableMap() {
        streamHcatEnableMap.clear();
    }

    private void getPathsToPurge(Map<String, Path> mergedStreamsInClusterPathMap,
            Map<String, Path> localStreamsInClusterPathMap) throws Exception {
        getStreamsPathToPurge(mergedStreamsInClusterPathMap, false);
        getStreamsPathToPurge(localStreamsInClusterPathMap, true);
        getTrashPathsToPurge();

    }

    private void getTrashPathsToPurge() throws Exception {
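        // The trash layout is one directory per date with one sub-directory per hour.
        // Queue every hour directory older than the trash retention period; queue
        // unparseable paths unconditionally, and queue an empty date directory once
        // its last hour (23) has aged past the retention period.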
        Path trashRoot = cluster.getTrashPath();
        LOG.debug("Looking for trashPaths in [" + trashRoot + "]");
        FileStatus[] trashDatePaths = getAllFilesInDir(trashRoot, fs);
        // For each trashpath
        if (trashDatePaths != null && trashDatePaths.length >= 1) {
            for (FileStatus trashPath : trashDatePaths) {
                FileStatus[] trashHourPaths = getAllFilesInDir(trashPath.getPath(), fs);
                if (trashHourPaths != null && trashHourPaths.length >= 1) {
                    for (FileStatus trashHourPath : trashHourPaths) {
                        try {
                            Calendar trashPathHourDate = getDateFromTrashPath(trashPath.getPath().getName(),
                                    trashHourPath.getPath().getName());
                            if (isPurge(trashPathHourDate, getTrashPathRetentionInHours())) {
                                streamsToPurge.add(trashHourPath.getPath().makeQualified(fs));
                            }
                        } catch (NumberFormatException e) {
                            streamsToPurge.add(trashHourPath.getPath().makeQualified(fs));
                        }
                    }
                } else {
                    try {
                        /*
                         * The date directory is empty. Check the time elapsed between the
                         * last hour of the empty directory (23rd hour) and the current time
                         * and add the date directory to streamsToPurge only if the time
                         * elapsed is greater than the trash path retention in hours
                         */
                        Calendar trashPathDate = getDateFromTrashPath(trashPath.getPath().getName(), "23");
                        if (isPurge(trashPathDate, getTrashPathRetentionInHours()))
                            streamsToPurge.add(trashPath.getPath().makeQualified(fs));
                    } catch (NumberFormatException e) {
                        streamsToPurge.add(trashPath.getPath().makeQualified(fs));
                    }
                }
            }
        }
    }

    private Calendar getDateFromTrashPath(String trashDatePath, String trashHourPath) {
        // Eg: TrashPath :: 2012-1-9
        String[] date = trashDatePath.split("-");
        String year = date[0];
        String month = date[1];
        String day = date[2];

        return CalendarHelper.getDateHour(year, month, day, trashHourPath);

    }

    private Map<String, Path> getStreamsInCluster(String root) throws Exception {
        Map<String, Path> streams = new HashMap<String, Path>();
        LOG.debug("Find streams in [" + root + "]");
        FileStatus[] paths = getAllFilesInDir(new Path(root), fs);
        if (paths != null) {
            for (FileStatus fileStatus : paths) {
                streams.put(fileStatus.getPath().getName(), fileStatus.getPath().makeQualified(fs));
                LOG.debug("Purger working for stream [" + fileStatus.getPath() + "]");
            }
        } else
            LOG.debug("No streams found in [" + root + "]");
        return streams;
    }

    private void getStreamsPathToPurge(Map<String, Path> streamPathMap, boolean isLocal) throws Exception {
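        // Walk each stream's year/month/day/hour tree and queue every hour directory
        // older than the stream's retention period. A directory with no children
        // (e.g. a day without hours) is queued for purging directly.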
        for (Map.Entry<String, Path> entry : streamPathMap.entrySet()) {
            String streamName = entry.getKey();
            Path streamRootPath = entry.getValue();
            String tableName = null;
            if (isLocal) {
                tableName = LOCAL_TABLE_PREFIX + "_" + streamName;
            } else {
                tableName = TABLE_PREFIX + "_" + streamName;
            }
            LOG.debug(
                    "Find Paths to purge for stream [" + streamName + "] streamRootPath [" + streamRootPath + "]");
            // For each Stream, all years
            FileStatus[] years = getAllFilesInDir(streamRootPath, fs);
            if (years != null) {
                for (FileStatus year : years) {
                    String yearVal = year.getPath().getName();
                    // For each month
                    FileStatus[] months = getAllFilesInDir(year.getPath(), fs);
                    if (months != null && months.length >= 1) {
                        for (FileStatus month : months) {
                            String monthVal = month.getPath().getName();
                            // For each day
                            FileStatus[] days = getAllFilesInDir(month.getPath(), fs);
                            if (days != null && days.length >= 1) {
                                for (FileStatus day : days) {
                                    String dayVal = day.getPath().getName();
                                    // For each hour
                                    FileStatus[] hours = getAllFilesInDir(day.getPath(), fs);
                                    if (hours != null && hours.length >= 1) {
                                        for (FileStatus hour : hours) {
                                            LOG.debug("Working for hour [" + hour.getPath() + "]");

                                            String hourVal = hour.getPath().getName();
                                            Calendar streamDate = CalendarHelper.getDateHour(yearVal, monthVal,
                                                    dayVal, hourVal);
                                            LOG.debug("Validate [" + streamDate.toString()
                                                    + "] against retentionHours [" + getRetentionPeriod(streamName)
                                                    + "]");
                                            if (isPurge(streamDate, getRetentionPeriod(streamName))) {
                                                LOG.debug("Adding stream to purge [" + hour.getPath() + "]");
                                                Path hourPath = hour.getPath().makeQualified(fs);
                                                addPartitionToList(streamName, tableName, hourPath, yearVal,
                                                        monthVal, dayVal, hourVal);
                                                streamsToPurge.add(hourPath);
                                            }
                                        }
                                    } else {
                                        Path dayPath = day.getPath().makeQualified(fs);
                                        // No hour found in day. Purge day
                                        addPartitionToList(streamName, tableName, dayPath, yearVal, monthVal,
                                                dayVal);
                                        streamsToPurge.add(dayPath);
                                    }
                                } // each day
                            } else {
                                // No day found in month. Purge month
                                Path monthPath = month.getPath().makeQualified(fs);
                                addPartitionToList(streamName, tableName, monthPath, yearVal, monthVal);
                                streamsToPurge.add(monthPath);
                            }
                        } // each month
                    } else {
                        // no months found in year. Purge Year.
                        Path yearPath = year.getPath().makeQualified(fs);
                        addPartitionToList(streamName, tableName, yearPath, yearVal);
                        streamsToPurge.add(yearPath);
                    }
                } // each year
            }
        } // each stream
    }

    private boolean isHCatEnabledStream(String streamName) {
        if (streamHcatEnableMap.containsKey(streamName)) {
            return streamHcatEnableMap.get(streamName);
        }
        return false;
    }
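    /*
     * The addPartitionToList overloads below cascade into the hour-level variant;
     * only the partition levels actually present are recorded, and nothing is
     * recorded for streams that do not have HCat enabled.
     */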

    private void addPartitionToList(String streamName, String tableName, Path yearPath, String yearVal) {
        addPartitionToList(streamName, tableName, yearPath, yearVal, null);
    }

    private void addPartitionToList(String streamName, String tableName, Path monthPath, String yearVal,
            String monthVal) {
        addPartitionToList(streamName, tableName, monthPath, yearVal, monthVal, null);
    }

    private void addPartitionToList(String streamName, String tableName, Path dayPath, String yearVal,
            String monthVal, String dayVal) {
        addPartitionToList(streamName, tableName, dayPath, yearVal, monthVal, dayVal, null);
    }

    private void addPartitionToList(String streamName, String tableName, Path hourPath, String yearVal,
            String monthVal, String dayVal, String hourVal) {
        if (!isHCatEnabledStream(streamName)) {
            return;
        }
        Map<String, String> partSpec = new HashMap<String, String>();
        partSpec.put(YEAR_PARTITION_NAME, yearVal);
        if (monthVal != null) {
            partSpec.put(MONTH_PARTITION_NAME, monthVal);
        }
        if (dayVal != null) {
            partSpec.put(DAY_PARTITION_NAME, dayVal);
        }
        if (hourVal != null) {
            partSpec.put(HOUR_PARTITION_NAME, hourVal);
        }
        String filterString = getFilterString(partSpec);
        PartitionDesc partDesc = new PartitionDesc(streamName, tableName, hourPath.toString(), partSpec,
                filterString);
        pathPartitionDescMap.put(hourPath, partDesc);
    }

    public boolean isPurge(Calendar streamDate, Integer retentionPeriodinHours) {
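        // A path is purgeable once the whole hours elapsed between its timestamp and
        // now reach the retention period.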
        // int streamDay = streamDate.get(Calendar.DAY_OF_MONTH);
        Calendar nowTime = CalendarHelper.getNowTime();
        String streamDateStr = dateFormat.format(new Date(streamDate.getTimeInMillis()));
        String nowTimeStr = dateFormat.format(new Date(nowTime.getTimeInMillis()));

        LOG.debug("streamDate [" + streamDateStr + "] currentDate : [" + nowTimeStr + "] against retention ["
                + retentionPeriodinHours + " Hours]");

        LOG.debug("Hours between streamDate and nowTime is [" + getHoursBetweenDates(streamDate, nowTime)
                + " Hours]");
        return getHoursBetweenDates(streamDate, nowTime) >= retentionPeriodinHours;
    }

    private int getHoursBetweenDates(Calendar startDate, Calendar endDate) {
        long diff = endDate.getTimeInMillis() - startDate.getTimeInMillis();
        int hours = (int) (diff / MILLISECONDS_PER_HOUR);
        return Math.abs(hours);
    }

    private void purge() throws HiveException, InterruptedException {
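        // Delete every queued path from HDFS. Paths that map to an HCat partition have
        // the matching partitions dropped first; if some partitions cannot be dropped,
        // the run stops without deleting the remaining paths.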
        for (Path purgePath : streamsToPurge) {
            if (pathPartitionDescMap.containsKey(purgePath)) {
                PartitionDesc partDesc = pathPartitionDescMap.get(purgePath);
                if (partDesc != null) {
                    LOG.info("Dropping the partition : " + partDesc.getLocation() + " from "
                            + partDesc.getTableName() + " table");
                    try {
                        Table table = Hive.get().getTable(Conduit.getHcatDBName(), partDesc.getTableName(), true);

                        List<Partition> partitions = getPartitionsByFilter(partDesc, table);
                        // drop all partitions for a specified partitionSpec
                        if (dropPartitions(partDesc, partitions)) {
                            pathPartitionDescMap.remove(purgePath);
                        } else {
                            LOG.info("Did not drop all the partitions for " + partDesc.getPartSpec()
                                    + " partspec from " + partDesc.getTableName());
                            break;
                        }
                    } catch (HiveException e) {
                        if (e.getCause() instanceof NoSuchObjectException) {
                            LOG.warn("partition " + partDesc.getLocation() + " does not" + " exist in "
                                    + partDesc.getTableName() + " table");
                        } else {
                            updatePartitionFailureMetrics();
                            throw e;
                        }
                    }
                }
            }
            try {
                LOG.info("Purging [" + purgePath + "]");
                fs.delete(purgePath, true);
                ConduitMetrics.updateSWGuage(getServiceType(), PURGEPATHS_COUNT, getName(), 1);
            } catch (Exception e) {
                LOG.warn("Cannot delete path " + purgePath, e);
                ConduitMetrics.updateSWGuage(getServiceType(), DELETE_FAILURES_COUNT, getName(), 1);
            }
        }
    }

    private boolean dropPartitions(PartitionDesc partDesc, List<Partition> partitions) throws HiveException {
        if (partitions == null) {
            return true;
        }
        for (Partition partition : partitions) {
            if (Hive.get().dropPartition(Conduit.getHcatDBName(), partDesc.getTableName(), partition.getValues(),
                    true)) {
                LOG.debug("partition " + partition.getLocation() + " dropped" + " successfully from "
                        + partDesc.getTableName() + " table");
            } else {
                LOG.warn("Not able to drop the partition " + partDesc.getLocation() + " from "
                        + partDesc.getTableName() + " table");
                return false;
            }
        }
        return true;
    }

    private void updatePartitionFailureMetrics() {
        ConduitMetrics.updateSWGuage(getServiceType(), HCAT_PURGE_PARTITION_FAILURES_COUNT, getName(), 1);
        ConduitMetrics.updateSWGuage(getServiceType(), HCAT_CONNECTION_FAILURES, getName(), 1);
    }

    private static String getFilterString(Map<String, String> partitionSpec) {
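        // Build a Hive partition filter by ANDing key="value" clauses, one per entry
        // of the partition spec (clause order follows the map's iteration order).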
        final String AND = " AND ";

        StringBuilder filter = new StringBuilder();
        for (Map.Entry<String, String> entry : partitionSpec.entrySet()) {
            filter.append(entry.getKey()).append("=").append("\"").append(entry.getValue()).append("\"")
                    .append(AND);
        }

        int length = filter.toString().length();
        if (length > 0)
            filter.delete(length - AND.length(), length);

        return filter.toString();
    }

    private List<Partition> getPartitionsByFilter(PartitionDesc partDesc, Table table) throws HiveException {
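        // List the partitions of the table matching the filter string. Returns null
        // when the metastore reports the object does not exist; other metastore
        // errors are rethrown as HiveException.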
        String filterString = partDesc.getFilterString();
        LOG.info("filter String " + filterString + " for dropping partition");
        try {
            return Hive.get().getPartitionsByFilter(table, filterString);
        } catch (MetaException e) {
            throw new HiveException(e.getCause());
        } catch (NoSuchObjectException e) {
            LOG.info("got NoSuchObject exception while trying to list partitions from "
                    + partDesc.getTableName() + " for filter " + filterString);
            return null;
        } catch (TException e) {
            throw new HiveException(e.getCause());
        }
    }

    private FileStatus[] getAllFilesInDir(Path dir, FileSystem fs) throws Exception {
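        // List the children of dir, returning null (rather than throwing) when the
        // directory does not exist.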
        FileStatus[] files = null;
        try {
            files = fs.listStatus(dir);
        } catch (FileNotFoundException e) {
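            // Expected when the directory is missing; fall through and return null.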

        }
        return files;
    }

    @Override
    protected String getTier() {
        throw new UnsupportedOperationException("Requested method is not implemented in the purger service");
    }

    @Override
    protected String getTopicNameFromDestnPath(Path destnPath) {
        throw new UnsupportedOperationException("Requested method is not implemented in the purger service");
    }

    /**
     * Get the service type.
     */
    public String getServiceType() {
        return "DataPurgerService";
    }

    @Override
    protected String getTableName(String stream) {
        throw new UnsupportedOperationException("Requested method is not implemented in the purger service");
    }
}
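
The purge decision itself reduces to the hour-granularity comparison in isPurge and getHoursBetweenDates. The standalone sketch below mirrors that logic outside the service so it can be compiled and run on its own; the RetentionCheck class name, the use of Calendar.getInstance() in place of CalendarHelper.getNowTime(), and the sample values are illustrative only and are not part of the Conduit code base.

import java.util.Calendar;

public class RetentionCheck {
    private static final long MILLISECONDS_PER_HOUR = 60 * 60 * 1000L;

    // Mirrors DataPurgerService.getHoursBetweenDates: whole hours between two instants.
    static int hoursBetween(Calendar start, Calendar end) {
        long diff = end.getTimeInMillis() - start.getTimeInMillis();
        return Math.abs((int) (diff / MILLISECONDS_PER_HOUR));
    }

    // Mirrors DataPurgerService.isPurge: purge once the elapsed hours reach the retention period.
    static boolean isPurge(Calendar pathDate, int retentionHours) {
        return hoursBetween(pathDate, Calendar.getInstance()) >= retentionHours;
    }

    public static void main(String[] args) {
        Calendar pathDate = Calendar.getInstance();
        pathDate.add(Calendar.HOUR_OF_DAY, -49);   // a path written 49 hours ago

        System.out.println(isPurge(pathDate, 48)); // true  (49 >= 48)
        System.out.println(isPurge(pathDate, 96)); // false (49 <  96)
    }
}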