com.inmobi.databus.distcp.MirrorStreamService.java Source code

Java tutorial

Introduction

Here is the source code for com.inmobi.databus.distcp.MirrorStreamService.java

Source

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.inmobi.databus.distcp;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import com.inmobi.databus.Cluster;
import com.inmobi.databus.DatabusConfig;
import com.inmobi.databus.utils.DatePathComparator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCpOptions;

/* Assumption - Mirror is always of a merged Stream. There is only 1 instance of a merged Stream.
 * (i)   1 Mirror Thread per src DatabusConfig.Cluster from where streams need to be mirrored on destCluster
 * (ii)  Mirror stream and merged Stream can't coexist on the same Cluster
 * (iii) Mirror stream and merged Stream threads don't race with each other as they work on different
 *       streams, based on assumption (ii)
 */

/**
 * Distcp-based service that pulls data from a source cluster into a temporary
 * directory on the destination cluster and then moves it into the destination
 * cluster's final stream directories.
 *
 * <p>One run ({@link #execute()}) consists of: preparing a clean temporary
 * directory, building the distcp input file, running distcp, renaming the
 * copied files into their final location ({@link #doLocalCommit(Map)}) and
 * finally committing the consumed paths via the base class.
 */
public class MirrorStreamService extends DistcpBaseService {
    private static final Log LOG = LogFactory.getLog(MirrorStreamService.class);

    /**
     * @param config             databus configuration handed to the base service
     * @param srcCluster         cluster the data is pulled from
     * @param destinationCluster cluster the data is mirrored onto
     * @throws Exception if the base service cannot be initialised
     */
    public MirrorStreamService(DatabusConfig config, Cluster srcCluster, Cluster destinationCluster)
            throws Exception {
        super(config, MirrorStreamService.class.getName(), srcCluster, destinationCluster);
    }

    /**
     * @return the mirror consume path of the source cluster for the
     *         destination cluster
     */
    @Override
    protected Path getInputPath() throws IOException {
        return getSrcCluster().getMirrorConsumePath(getDestCluster());
    }

    /**
     * Performs one mirror run. Any {@link Exception} is logged and swallowed
     * so that a failed run is simply skipped; the temporary output directory
     * is removed on every exit path.
     */
    @Override
    protected void execute() throws Exception {
        Path tmpOut = null;
        try {
            LinkedHashMap<Path, FileSystem> consumePaths = new LinkedHashMap<Path, FileSystem>();

            tmpOut = new Path(getDestCluster().getTmpPath(),
                    "distcp_mirror_" + getSrcCluster().getName() + "_" + getDestCluster().getName())
                            .makeQualified(getDestFs());

            // Start every run from an empty tmpOut so leftovers of an earlier
            // run can never be committed.
            if (getDestFs().exists(tmpOut)) {
                getDestFs().delete(tmpOut, true);
            }
            if (!getDestFs().mkdirs(tmpOut)) {
                LOG.warn("Cannot create [" + tmpOut + "]..skipping this run");
                return;
            }
            Path tmp = new Path(tmpOut, "tmp");
            if (!getDestFs().mkdirs(tmp)) {
                LOG.warn("Cannot create [" + tmp + "]..skipping this run");
                return;
            }

            Path inputFilePath = getDistCPInputFile(consumePaths, tmp);
            if (inputFilePath == null) {
                LOG.warn("No data to pull from " + "Cluster [" + getSrcCluster().getHdfsUrl() + "]"
                        + " to Cluster [" + getDestCluster().getHdfsUrl() + "]");
                return;
            }

            LOG.warn("Starting a Mirrored distcp pull from [" + inputFilePath.toString() + "] " + "Cluster ["
                    + getSrcCluster().getHdfsUrl() + "]" + " to Cluster [" + getDestCluster().getHdfsUrl() + "] "
                    + " Path [" + tmpOut.toString() + "]");

            DistCpOptions options = getDistCpOptions(inputFilePath, tmpOut);
            options.setPreserveSrcPath(true);

            boolean distcpSucceeded;
            try {
                distcpSucceeded = executeDistCp(options);
                if (!distcpSucceeded) {
                    LOG.warn("Mirrored distcp did not succeed..skipping commit for this run");
                }
            } catch (Throwable e) {
                // Throwable on purpose: whatever distcp throws must only skip
                // the commit, never abort the service.
                LOG.warn("Problem in Mirrored distcp..skipping commit for this run", e);
                distcpSucceeded = false;
            }

            if (distcpSucceeded) {
                LinkedHashMap<FileStatus, Path> commitPaths = prepareForCommit(tmpOut);
                doLocalCommit(commitPaths);
                // Consumed paths are committed only after the local commit
                // succeeded; a failure above leaves them for the next run.
                doFinalCommit(consumePaths);
            }
        } catch (Exception e) {
            LOG.warn("Error in MirrorStream Service..skipping RUN ", e);
        } finally {
            cleanupTmpOut(tmpOut);
        }
    }

    /**
     * Best-effort removal of the temporary output directory. Failures are
     * logged and never propagated, so they cannot mask the outcome of the run.
     *
     * @param tmpOut directory to delete; {@code null} if it was never computed
     */
    private void cleanupTmpOut(Path tmpOut) {
        if (tmpOut == null) {
            return;
        }
        try {
            getDestFs().delete(tmpOut, true);
            LOG.debug("Cleanup [" + tmpOut + "]");
        } catch (Exception e) {
            LOG.warn("Cannot cleanup [" + tmpOut + "]", e);
        }
    }

    /**
     * Moves every entry of {@code commitPaths} to its final location on the
     * destination file system, in the iteration order of the map. Directory
     * entries are created at the target; file entries are renamed after their
     * target parent directory has been created.
     *
     * @param commitPaths source status (inside the temporary directory) mapped
     *                    to the final destination path
     * @throws Exception if a rename reports failure; the commit is aborted at
     *                   that point and the remaining entries are not processed
     */
    void doLocalCommit(Map<FileStatus, Path> commitPaths) throws Exception {
        LOG.info("Committing " + commitPaths.size() + " paths.");
        FileSystem destFs = getDestFs();
        for (Map.Entry<FileStatus, Path> entry : commitPaths.entrySet()) {
            FileStatus source = entry.getKey();
            Path target = entry.getValue();
            LOG.info("Renaming [" + source.getPath() + "] to [" + target + "]");

            if (source.isDir()) {
                // Empty source directory: there is nothing to rename, only the
                // directory itself is reproduced at the destination.
                makeDirectory(destFs, target);
                continue;
            }

            makeDirectory(destFs, target.getParent());
            if (!destFs.rename(source.getPath(), target)) {
                LOG.warn("Failed to rename.Aborting transaction COMMIT to avoid "
                        + "data loss. Partial data replay could happen in next run");
                throw new IOException(
                        "Rename failed from [" + source.getPath() + "] to " + "[" + target + "]");
            }
        }
    }

    /**
     * Creates {@code dir} (and missing parents) and logs a warning when the
     * file system reports that it could not do so. The result is only logged,
     * not enforced: for files the subsequent rename decides whether the commit
     * is aborted.
     */
    private void makeDirectory(FileSystem fs, Path dir) throws IOException {
        if (!fs.mkdirs(dir)) {
            LOG.warn("Cannot create [" + dir + "]");
        }
    }

    /**
     * Builds the ordered mapping of everything distcp copied into
     * {@code tmpOut} to its final destination path.
     *
     * @param tmpOut temporary distcp output directory, e.g.
     *               /databus/system/tmp/distcp_mirror_&lt;srcCluster&gt;_&lt;destCluster&gt;/
     * @return insertion-ordered map of copied file (or empty directory) status
     *         to final destination path; empty when nothing was copied
     * @throws Exception if listing the temporary directory fails
     */
    LinkedHashMap<FileStatus, Path> prepareForCommit(Path tmpOut) throws Exception {
        /*
         * After distcp, paths inside tmpOut look like:
         *
         * <tmpOut>/databus/streams/<streamName>/2012/1/13/15/7/
         * <hostname>-<streamName>-2012-01-16-07-21_00000.gz
         *
         * tmpStreamRoot is therefore <tmpOut>/databus/streams/.
         */
        FileSystem destFs = getDestFs();

        // Plain string concatenation is intentional: the appended root is
        // concatenated as-is rather than resolved as a child path. Path.SEPARATOR
        // is used instead of the platform-dependent java.io.File.separator.
        Path tmpStreamRoot = new Path(tmpOut.makeQualified(destFs).toString() + Path.SEPARATOR
                + getSrcCluster().getUnqaulifiedFinalDestDirRoot());
        LOG.debug("tmpStreamRoot [" + tmpStreamRoot + "]");

        /*
         * Multiple streams can get mirrored from the same cluster. Streams can
         * be processed in any order, but the order of paths within a stream
         * has to be retained - hence the LinkedHashMap.
         */
        LinkedHashMap<FileStatus, Path> commitPaths = new LinkedHashMap<FileStatus, Path>();
        FileStatus[] streamRoots = destFs.listStatus(tmpStreamRoot);
        if (streamRoots == null) {
            return commitPaths;
        }

        for (FileStatus streamRoot : streamRoots) {
            LOG.debug("StreamRoot [" + streamRoot.getPath() + "] streamName [" + streamRoot.getPath().getName()
                    + "]");
            List<FileStatus> streamPaths = new ArrayList<FileStatus>();
            createListing(destFs, streamRoot, streamPaths);
            // Order the paths of this stream with DatePathComparator
            // (intended order: YYYY/mm/DD/HH/MM).
            Collections.sort(streamPaths, new DatePathComparator());
            LOG.debug("createListing size: [" + streamPaths.size() + "]");
            createCommitPaths(commitPaths, streamPaths);
        }
        return commitPaths;
    }

    /**
     * Appends one entry per element of {@code streamPaths} to
     * {@code commitPaths}, mapping it to
     * &lt;finalDestDirRoot&gt;/&lt;stream&gt;/&lt;year&gt;/&lt;month&gt;/&lt;day&gt;/&lt;hour&gt;/&lt;minute&gt;[/&lt;file&gt;].
     *
     * <p>An element is either a file such as
     * .../streams/&lt;streamName&gt;/2012/1/13/15/7/&lt;hostname&gt;-&lt;streamName&gt;-2012-01-16-07-21_00000.gz
     * or an empty directory such as .../streams/&lt;streamName&gt;/2012/1/13/15/7/.
     *
     * <p>NOTE(review): the path components are taken purely by position
     * (parent-of-parent...), so every element is assumed to live exactly at
     * minute-directory depth below its stream directory - confirm that
     * shallower empty directories cannot occur.
     */
    private void createCommitPaths(Map<FileStatus, Path> commitPaths, List<FileStatus> streamPaths) {
        for (FileStatus fileStatus : streamPaths) {
            boolean emptyDirectory = fileStatus.isDir();
            Path minuteDir = emptyDirectory ? fileStatus.getPath() : fileStatus.getPath().getParent();
            Path hourDir = minuteDir.getParent();
            Path dayDir = hourDir.getParent();
            Path monthDir = dayDir.getParent();
            Path yearDir = monthDir.getParent();
            Path streamDir = yearDir.getParent();

            String finalPath = getDestCluster().getFinalDestDirRoot() + Path.SEPARATOR + streamDir.getName()
                    + Path.SEPARATOR + yearDir.getName() + Path.SEPARATOR + monthDir.getName() + Path.SEPARATOR
                    + dayDir.getName() + Path.SEPARATOR + hourDir.getName() + Path.SEPARATOR
                    + minuteDir.getName();
            if (!emptyDirectory) {
                finalPath += Path.SEPARATOR + fileStatus.getPath().getName();
            }

            commitPaths.put(fileStatus, new Path(finalPath));
            LOG.debug("Going to commit [" + fileStatus.getPath() + "] to [" + finalPath + "]");
        }
    }

    /**
     * Recursively collects, below {@code fileStatus}, every file and every
     * empty directory into {@code results}. Non-empty directories are
     * traversed but not added themselves.
     *
     * @param fs         file system used for listing
     * @param fileStatus file or directory to start from
     * @param results    list the collected statuses are appended to
     * @throws IOException if listing a directory fails
     */
    void createListing(FileSystem fs, FileStatus fileStatus, List<FileStatus> results) throws IOException {
        if (!fileStatus.isDir()) {
            LOG.debug("createListing :: Adding [" + fileStatus.getPath() + "]");
            results.add(fileStatus);
            return;
        }

        FileStatus[] children = fs.listStatus(fileStatus.getPath());
        if (children == null) {
            // No listing available for this directory; skip it instead of
            // failing with a NullPointerException.
            LOG.warn("createListing :: Cannot list [" + fileStatus.getPath() + "]");
            return;
        }
        if (children.length == 0) {
            results.add(fileStatus);
            LOG.debug("createListing :: Adding [" + fileStatus.getPath() + "]");
            return;
        }
        for (FileStatus child : children) {
            createListing(fs, child, results);
        }
    }
}