com.inmobi.databus.distcp.MergedStreamService.java Source code

Introduction

Here is the source code for com.inmobi.databus.distcp.MergedStreamService.java. The class handles the MergedStream pull between a pair of clusters: it runs a distcp copy from the source cluster into a temporary directory on the destination cluster, commits the pulled files into their final per-category locations, and hands the committed paths off to the mirror-stream consumers.

Source

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.inmobi.databus.distcp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.inmobi.databus.Cluster;
import com.inmobi.databus.DatabusConfig;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/*
 * Handles MergedStreams for a Cluster
 */

public class MergedStreamService extends DistcpBaseService {

    private static final Log LOG = LogFactory.getLog(MergedStreamService.class);
    private Map<String, Set<Path>> missingDirsCommittedPaths = new HashMap<String, Set<Path>>();

    public MergedStreamService(DatabusConfig config, Cluster srcCluster, Cluster destinationCluster)
            throws Exception {
        super(config, MergedStreamService.class.getName(), srcCluster, destinationCluster);
    }

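    /*
     * Runs one pull cycle: cleans and recreates the distcp tmp directory,
     * publishes missing directories, pulls new files from the source cluster
     * via distcp, commits them to their final per-category locations on the
     * destination cluster, and hands the committed paths off to the
     * mirror-stream consumers.
     */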
    @Override
    public void execute() throws Exception {
        try {
            boolean skipCommit = false;
            Map<Path, FileSystem> consumePaths = new HashMap<Path, FileSystem>();

            Path tmpOut = new Path(getDestCluster().getTmpPath(),
                    "distcp_mergedStream_" + getSrcCluster().getName() + "_" + getDestCluster().getName())
                            .makeQualified(getDestFs());
            // Clean up tmpOut before every run
            if (getDestFs().exists(tmpOut))
                getDestFs().delete(tmpOut, true);
            if (!getDestFs().mkdirs(tmpOut)) {
                LOG.warn("Cannot create [" + tmpOut + "]..skipping this run");
                return;
            }
            Path tmp = new Path(tmpOut, "tmp");
            if (!getDestFs().mkdirs(tmp)) {
                LOG.warn("Cannot create [" + tmp + "]..skipping this run");
                return;
            }

            synchronized (getDestCluster()) {
                addPublishMissingPaths(missingDirsCommittedPaths, -1, null);
            }

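            // Build the input file listing the source paths for this distcp run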
            Path inputFilePath = getDistCPInputFile(consumePaths, tmp);
            if (inputFilePath == null) {
                LOG.warn("No data to pull from " + "Cluster [" + getSrcCluster().getHdfsUrl() + "]"
                        + " to Cluster [" + getDestCluster().getHdfsUrl() + "]");
                if (missingDirsCommittedPaths.size() > 0) {
                    LOG.warn("Adding Missing Directories for Pull " + missingDirsCommittedPaths.size());
                    commitMirroredConsumerPaths(missingDirsCommittedPaths, tmp);
                }
                return;
            }
            LOG.warn("Starting a distcp pull from [" + inputFilePath.toString() + "] " + "Cluster ["
                    + getSrcCluster().getHdfsUrl() + "]" + " to Cluster [" + getDestCluster().getHdfsUrl() + "] "
                    + " Path [" + tmpOut.toString() + "]");

            try {
                if (!executeDistCp(getDistCpOptions(inputFilePath, tmpOut)))
                    skipCommit = true;
            } catch (Throwable e) {
                LOG.warn("Error in distcp", e);
                LOG.warn("Problem in MergedStream distcp PULL..skipping commit for this run");
                skipCommit = true;
            }
            Map<String, Set<Path>> committedPaths = null;

            // if success
            if (!skipCommit) {
                Map<String, List<Path>> categoriesToCommit = prepareForCommit(tmpOut);
                synchronized (getDestCluster()) {
                    long commitTime = getDestCluster().getCommitTime();
                    addPublishMissingPaths(missingDirsCommittedPaths, commitTime, categoriesToCommit.keySet());

                    // category, Set of Paths to commit
                    committedPaths = doLocalCommit(commitTime, categoriesToCommit);

                    for (Map.Entry<String, Set<Path>> entry : missingDirsCommittedPaths.entrySet()) {
                        Set<Path> filesList = committedPaths.get(entry.getKey());
                        if (filesList != null)
                            filesList.addAll(entry.getValue());
                        else
                            committedPaths.put(entry.getKey(), entry.getValue());
                    }
                }
                // Prepare paths for MirrorStreamConsumerService
                commitMirroredConsumerPaths(committedPaths, tmp);
                // Cleanup happens in parallel without sync
                // no race is there in consumePaths, tmpOut
                doFinalCommit(consumePaths);
            }
            // Clean up tmpOut
            LOG.debug("Deleting [" + tmpOut + "]");
            getDestFs().delete(tmpOut, true);
        } catch (Exception e) {
            LOG.warn("Error in run ", e);
            throw new Exception(e);
        }
    }

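    /*
     * Publishes missing directories on the destination cluster and accumulates
     * them in missingDirsCommittedPaths. When categoriesToCommit is non-null,
     * only those categories are published up to commitTime; otherwise missing
     * directories are published for all categories.
     */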
    private void addPublishMissingPaths(Map<String, Set<Path>> missingDirsCommittedPaths, long commitTime,
            Set<String> categoriesToCommit) throws Exception {
        Map<String, Set<Path>> missingDirsforCategory = null;

        if (categoriesToCommit != null) {
            missingDirsforCategory = new HashMap<String, Set<Path>>();
            for (String category : categoriesToCommit) {
                Set<Path> missingDirectories = publishMissingPaths(getDestFs(),
                        getDestCluster().getFinalDestDirRoot(), commitTime, category);
                missingDirsforCategory.put(category, missingDirectories);
            }
        } else {
            missingDirsforCategory = publishMissingPaths(getDestFs(), getDestCluster().getFinalDestDirRoot());
        }

        if (missingDirsforCategory != null) {
            for (Map.Entry<String, Set<Path>> entry : missingDirsforCategory.entrySet()) {
                LOG.debug("Adding [" + entry.getValue().size() + "] missing directories to commit paths for stream ["
                        + entry.getKey() + "]");
                Set<Path> missingPaths = missingDirsCommittedPaths.get(entry.getKey());
                if (missingPaths != null) {
                    missingPaths.addAll(entry.getValue());
                } else {
                    missingDirsCommittedPaths.put(entry.getKey(), entry.getValue());
                }
            }
        }
    }

    /*
     * @param committedPaths map of stream name to the set of paths committed
     * for that stream
     */
    private void commitMirroredConsumerPaths(Map<String, Set<Path>> committedPaths, Path tmp) throws Exception {
        // Map of Stream and clusters where it's mirrored
        Map<String, Set<Cluster>> mirrorStreamConsumers = new HashMap<String, Set<Cluster>>();
        Map<Path, Path> consumerCommitPaths = new LinkedHashMap<Path, Path>();
        // for each stream in committedPaths
        for (String stream : committedPaths.keySet()) {
            // for each cluster
            for (Cluster cluster : getConfig().getClusters().values()) {
                // is this stream to be mirrored on this cluster
                if (cluster.getMirroredStreams().contains(stream)) {
                    Set<Cluster> mirrorConsumers = mirrorStreamConsumers.get(stream);
                    if (mirrorConsumers == null)
                        mirrorConsumers = new HashSet<Cluster>();
                    mirrorConsumers.add(cluster);
                    mirrorStreamConsumers.put(stream, mirrorConsumers);
                }
            }
        } // for each stream

        // Commit paths for each consumer
        for (String stream : committedPaths.keySet()) {
            // consumers for this stream
            Set<Cluster> consumers = mirrorStreamConsumers.get(stream);
            Path tmpConsumerPath;
            if (consumers == null || consumers.isEmpty()) {
                LOG.warn("Consumer set is empty for stream [" + stream + "]");
                continue;
            }
            for (Cluster consumer : consumers) {
                // commit paths for this consumer, this stream
                // adding srcCluster avoids two remote copiers creating the same filename
                String tmpPath = "src_" + getSrcCluster().getName() + "_via_" + getDestCluster().getName()
                        + "_mirrorto_" + consumer.getName() + "_" + stream;
                tmpConsumerPath = new Path(tmp, tmpPath);
                FSDataOutputStream out = getDestFs().create(tmpConsumerPath);
                try {
                    for (Path path : committedPaths.get(stream)) {
                        LOG.debug("Writing Mirror Commit Path [" + path.toString() + "]");
                        out.writeBytes(path.toString());
                        out.writeBytes("\n");
                    }
                } finally {
                    out.close();
                }
                // Two MergedStream services may write a file for the same consumer
                // at the same time; adding the srcCluster name avoids that conflict
                Path finalMirrorPath = new Path(getDestCluster().getMirrorConsumePath(consumer),
                        tmpPath + "_" + System.currentTimeMillis());
                consumerCommitPaths.put(tmpConsumerPath, finalMirrorPath);
            } // for each consumer
        } // for each stream

        if (consumerCommitPaths.isEmpty()) {
            LOG.info("consumerCommitPaths is empty for all streams, skipping mirrorCommit");
            missingDirsCommittedPaths.clear();
            return;
        }
        // Do the final mirrorCommit
        LOG.info("Committing [" + consumerCommitPaths.size() + "] paths for " + "mirrored Stream");
        FileSystem fs = FileSystem.get(getDestCluster().getHadoopConf());
        for (Map.Entry<Path, Path> entry : consumerCommitPaths.entrySet()) {
            LOG.info("Renaming [" + entry.getKey() + "] to [" + entry.getValue() + "]");
            fs.mkdirs(entry.getValue().getParent());
            if (!fs.rename(entry.getKey(), entry.getValue())) {
                LOG.warn("Failed to commit mirrored path. Aborting transaction to avoid data loss; "
                        + "partial data replay can happen for merged and mirror streams");
                throw new Exception("Rename failed from [" + entry.getKey() + "] to [" + entry.getValue() + "]");
            }
        }
        missingDirsCommittedPaths.clear();
    }

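    /*
     * Moves the distcp output under tmpOut into per-category intermediate
     * directories and returns, per category, the list of files staged for
     * commit.
     */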
    private Map<String, List<Path>> prepareForCommit(Path tmpOut) throws Exception {
        Map<String, List<Path>> categoriesToCommit = new HashMap<String, List<Path>>();
        FileStatus[] allFiles = getDestFs().listStatus(tmpOut);
        for (FileStatus file : allFiles) {
            String fileName = file.getPath().getName();
            if (fileName != null) {
                String category = getCategoryFromFileName(fileName);
                if (category != null) {
                    Path intermediatePath = new Path(tmpOut, category);
                    if (!getDestFs().exists(intermediatePath))
                        getDestFs().mkdirs(intermediatePath);
                    Path source = file.getPath().makeQualified(getDestFs());
                    // Use the Path(parent, child) constructor rather than
                    // File.separator, which is platform dependent
                    Path intermediateFilePath = new Path(intermediatePath.makeQualified(getDestFs()), fileName);
                    if (!getDestFs().rename(source, intermediateFilePath)) {
                        LOG.warn("Failed to rename [" + source + "] to [" + intermediateFilePath + "]");
                        LOG.warn("Aborting transaction prepareForCommit to avoid data "
                                + "loss. Retry would happen in next run");
                        throw new Exception("Rename failed from [" + source + "] to [" + intermediateFilePath + "]");
                    }
                    LOG.debug("Moving [" + source + "] to intermediateFilePath [" + intermediateFilePath + "]");
                    List<Path> fileList = categoriesToCommit.get(category);
                    if (fileList == null) {
                        fileList = new ArrayList<Path>();
                        categoriesToCommit.put(category, fileList);
                    }
                    fileList.add(intermediateFilePath);
                }
            }
        }
        return categoriesToCommit;
    }

    /*
     * @return map of stream name to the set of paths committed for that
     * stream
     */
    private Map<String, Set<Path>> doLocalCommit(long commitTime, Map<String, List<Path>> categoriesToCommit)
            throws Exception {
        Map<String, Set<Path>> committedPaths = new HashMap<String, Set<Path>>();
        for (Map.Entry<String, List<Path>> entry : categoriesToCommit.entrySet()) {
            String category = entry.getKey();
            List<Path> filesInCategory = entry.getValue();
            for (Path filePath : filesInCategory) {
                Path destParentPath = new Path(getDestCluster().getFinalDestDir(category, commitTime));
                if (!getDestFs().exists(destParentPath)) {
                    getDestFs().mkdirs(destParentPath);
                }
                LOG.debug("Moving from intermediatePath [" + filePath + "] to [" + destParentPath + "]");
                if (!getDestFs().rename(filePath, destParentPath)) {
                    LOG.warn("Rename failed, aborting transaction COMMIT to avoid "
                            + "data loss. Partial data replay could happen in next run");
                    throw new Exception("Abort transaction Commit. Rename failed from [" + filePath + "] to ["
                            + destParentPath + "]");
                }
                Path commitPath = new Path(destParentPath, filePath.getName());
                Set<Path> commitPaths = committedPaths.get(category);
                if (commitPaths == null) {
                    commitPaths = new HashSet<Path>();
                    committedPaths.put(category, commitPaths);
                }
                commitPaths.add(commitPath);
            }
        }
        return committedPaths;
    }

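    /*
     * Returns the path on the source cluster from which this destination
     * cluster consumes files.
     */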
    protected Path getInputPath() throws IOException {
        return getSrcCluster().getConsumePath(getDestCluster());
    }
}
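
Example

A minimal, hypothetical driver for this service might look like the sketch below. The config file name and cluster names are placeholders, and DatabusConfigParser is assumed to be the config loader from the same codebase, with the cluster map keyed by cluster name:

// Hypothetical driver: load a databus config, pick a source and a
// destination cluster, and run one merged-stream pull cycle.
DatabusConfig config = new DatabusConfigParser("databus.xml").getConfig(); // assumed loader
Cluster src = config.getClusters().get("srcClusterName");   // placeholder key
Cluster dest = config.getClusters().get("destClusterName"); // placeholder key
MergedStreamService service = new MergedStreamService(config, src, dest);
service.execute(); // one distcp pull, local commit, and mirror hand-off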