cascading.flow.tez.planner.Hadoop2TezFlowStepJob.java Source code

Java tutorial

Introduction

Here is the source code for cascading.flow.tez.planner.Hadoop2TezFlowStepJob.java

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.tez.planner;

import java.io.File;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;

import cascading.CascadingException;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.FlowStepJob;
import cascading.flow.tez.Hadoop2TezFlow;
import cascading.flow.tez.Hadoop2TezFlowStep;
import cascading.management.state.ClientState;
import cascading.stats.FlowNodeStats;
import cascading.stats.FlowStepStats;
import cascading.stats.tez.TezStepStats;
import cascading.stats.tez.util.TezStatsUtil;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.client.TezClient;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.dag.api.client.VertexStatus;

import static cascading.flow.FlowProps.JOB_POLLING_INTERVAL;
import static cascading.stats.CascadingStats.STATS_STORE_INTERVAL;

/**
 * A {@link FlowStepJob} that executes a single flow step on Apache Tez.
 * <p>
 * On start, a {@link TezClient} is created and started for the step, the step's {@link DAG} is
 * submitted, and the {@link DAGClient} returned by the submission is retained. That client is then
 * used to poll DAG and vertex status, to kill the DAG on stop, and is closed on cleanup.
 */
public class Hadoop2TezFlowStepJob extends FlowStepJob<TezConfiguration> {
    /** Status options requesting that counters be returned along with the DAG status. */
    private static final Set<StatusGetOpts> STATUS_GET_OPTS = EnumSet.of(StatusGetOpts.GET_COUNTERS);

    /** The DAG submitted by {@link #internalNonBlockingStart()}. */
    private final DAG dag;

    /** Client used to submit the DAG; {@code null} until {@link #internalNonBlockingStart()} runs. */
    private TezClient tezClient;

    /** Client returned by the DAG submission; {@code null} until the DAG has been submitted. */
    private DAGClient dagClient;

    /** Value of the {@code dagId} field read from {@link #dagClient} after submission, if present. */
    private String dagId;

    /**
     * Reads the stats store interval from the given configuration.
     *
     * @param configuration the configuration to read {@code STATS_STORE_INTERVAL} from
     * @return the configured value, or {@code 60 * 1000} when unset (presumably milliseconds)
     */
    private static long getStoreInterval(Configuration configuration) {
        return configuration.getLong(STATS_STORE_INTERVAL, 60 * 1000);
    }

    /**
     * Reads the job polling interval from the given configuration.
     *
     * @param configuration the configuration to read {@code JOB_POLLING_INTERVAL} from
     * @return the configured value, or {@code 5000} when unset (presumably milliseconds)
     */
    public static long getJobPollingInterval(Configuration configuration) {
        return configuration.getLong(JOB_POLLING_INTERVAL, 5000);
    }

    /**
     * Creates a job for the given step; the polling and stats store intervals are taken from
     * {@code currentConf}.
     *
     * @param clientState the client state handed to the parent class
     * @param flowStep    the flow step this job executes
     * @param currentConf the configuration for this step
     * @param dag         the Tez DAG to submit when the job is started
     */
    public Hadoop2TezFlowStepJob(ClientState clientState, BaseFlowStep<TezConfiguration> flowStep,
            TezConfiguration currentConf, DAG dag) {
        super(clientState, currentConf, flowStep, getJobPollingInterval(currentConf),
                getStoreInterval(currentConf));
        this.dag = dag;

        if (flowStep.isDebugEnabled())
            flowStep.logDebug("using polling interval: " + pollingInterval);
    }

    /**
     * Creates the step stats for this job.
     * <p>
     * The returned stats resolve their status client lazily: when the YARN timeline service is
     * enabled a timeline client is requested from {@link TezStatsUtil}; if that yields
     * {@code null}, or the service is disabled, the job's own {@link #dagClient} is used.
     *
     * @param clientState the client state handed to the stats instance
     * @return a new {@link TezStepStats} bound to this job
     */
    @Override
    protected FlowStepStats createStepStats(ClientState clientState) {
        return new TezStepStats(flowStep, clientState) {
            // resolved on first successful lookup; stays null while dagClient is still null
            private DAGClient timelineClient;

            @Override
            public synchronized DAGClient getJobStatusClient() {
                if (timelineClient != null)
                    return timelineClient;

                if (isTimelineServiceEnabled(jobConfiguration))
                    timelineClient = TezStatsUtil.createTimelineClient(dagClient); // may return null

                // fall back to the plain DAG client
                if (timelineClient == null)
                    timelineClient = dagClient;

                return timelineClient;
            }

            @Override
            public String getProcessStepID() {
                return dagId;
            }
        };
    }

    /**
     * Prepares the staging directory, creates and starts the {@link TezClient}, and submits the
     * DAG without waiting for it to complete.
     *
     * @throws IOException        if preparing the staging directory or a client call fails with I/O
     * @throws CascadingException wrapping any {@link TezException} raised during start or submit
     */
    protected void internalNonBlockingStart() throws IOException {
        try {
            if (!isTimelineServiceEnabled(jobConfiguration))
                flowStep.logWarn("'" + YarnConfiguration.TIMELINE_SERVICE_ENABLED
                        + "' is disabled, please enable to capture detailed metrics of completed flows, this may require starting the YARN timeline server daemon");

            // work on a copy so the staging dir setting does not leak into jobConfiguration
            TezConfiguration workingConf = new TezConfiguration(jobConfiguration);

            // this could be problematic
            flowStep.logInfo("tez session mode enabled: " + workingConf.getBoolean(
                    TezConfiguration.TEZ_AM_SESSION_MODE, TezConfiguration.TEZ_AM_SESSION_MODE_DEFAULT));

            prepareEnsureStagingDir(workingConf);

            tezClient = TezClient.create(flowStep.getName(), workingConf,
                    ((Hadoop2TezFlowStep) flowStep).getAllLocalResources(), null);

            tezClient.start();

            dagClient = tezClient.submitDAG(dag);

            // the dag id is not exposed through the DAGClient API, so it is read from the instance field
            dagId = Util.returnInstanceFieldIfExistsSafe(dagClient, "dagId");

            flowStep.logInfo("submitted tez dag to app master: {}, with dag id: {}",
                    tezClient.getAppMasterApplicationId(), dagId);
        } catch (TezException exception) {
            throw new CascadingException(exception);
        }
    }

    /**
     * Tells whether the YARN timeline service is enabled in the given configuration.
     *
     * @param workingConf the configuration to inspect
     * @return the value of {@code YarnConfiguration.TIMELINE_SERVICE_ENABLED}, or its YARN default
     */
    private boolean isTimelineServiceEnabled(TezConfiguration workingConf) {
        return workingConf.getBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED,
                YarnConfiguration.DEFAULT_TIMELINE_SERVICE_ENABLED);
    }

    /**
     * Maps the current Tez vertex state of a node onto the given node stats.
     * <p>
     * Does nothing if the DAG has not been submitted yet, or if no vertex status or state is
     * available. Failures while querying the status are logged and otherwise ignored.
     *
     * @param flowNodeStats the node stats to update; its id is used as the vertex name
     */
    @Override
    protected void updateNodeStatus(FlowNodeStats flowNodeStats) {
        if (dagClient == null)
            return;

        try {
            VertexStatus vertexStatus = dagClient.getVertexStatus(flowNodeStats.getID(), null); // no counters

            if (vertexStatus == null)
                return;

            VertexStatus.State state = vertexStatus.getState();

            if (state == null)
                return;

            switch (state) {
            case RUNNING:
                flowNodeStats.markRunning();
                break;

            case SUCCEEDED:
                markRunningIfNeeded(flowNodeStats);
                flowNodeStats.markSuccessful();
                break;

            case FAILED:
            case ERROR:
                markRunningIfNeeded(flowNodeStats);
                flowNodeStats.markFailed(null); // todo: lookup failure
                break;

            case KILLED:
                markRunningIfNeeded(flowNodeStats);
                flowNodeStats.markStopped();
                break;

            case NEW:
            case INITIALIZING:
            case INITED:
            case TERMINATING:
            default:
                // not started yet, or still winding down: nothing to record
                break;
            }
        } catch (IOException | TezException exception) {
            // log the exception that was actually caught so the cause is not lost
            flowStep.logError("failed setting node status", exception);
        }
    }

    /**
     * Marks the node as running unless it already reports running, so a terminal state observed
     * between polls is still preceded by the running transition.
     */
    private static void markRunningIfNeeded(FlowNodeStats flowNodeStats) {
        if (!flowNodeStats.isRunning())
            flowNodeStats.markRunning();
    }

    /**
     * Sets the step staging path as the Tez AM staging directory in {@code workingConf} and makes
     * sure that directory exists.
     *
     * @param workingConf the configuration to update and to resolve the file system from
     * @return the fully qualified staging directory
     * @throws IOException if the file system cannot be obtained or the directory cannot be ensured
     */
    private Path prepareEnsureStagingDir(TezConfiguration workingConf) throws IOException {
        String stepStagingPath = createStepStagingPath();

        workingConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stepStagingPath);

        FileSystem fileSystem = FileSystem.get(workingConf);
        Path stagingDir = fileSystem.makeQualified(new Path(stepStagingPath));

        TokenCache.obtainTokensForNamenodes(new Credentials(), new Path[] { stagingDir }, workingConf);

        TezClientUtils.ensureStagingDirExists(workingConf, stagingDir);

        // A URI scheme never contains ":/", so compare against the bare scheme of the qualified
        // path. The result of mkdirs() is deliberately ignored: it is false when the directory
        // already exists.
        if ("file".equals(stagingDir.toUri().getScheme()))
            new File(stagingDir.toUri()).mkdirs();

        return stagingDir;
    }

    /**
     * Builds the staging path for this step: the flow staging path followed by the step id,
     * prefixed with the value of {@code hadoop.tmp.dir} when running in local mode.
     *
     * @return the (unqualified) staging path for this step
     */
    String createStepStagingPath() {
        String result = "";

        if (HadoopUtil.isLocal(jobConfiguration))
            result = jobConfiguration.get("hadoop.tmp.dir") + Path.SEPARATOR;

        String flowStagingPath = ((Hadoop2TezFlow) flowStep.getFlow()).getFlowStagingPath();

        return result + flowStagingPath + Path.SEPARATOR + flowStep.getID();
    }

    /**
     * Fetches the current DAG state, logging a warning when the status or the state is missing.
     *
     * @return the DAG state, or {@code null} if it could not be determined
     */
    private DAGStatus.State getDagStatusState() {
        DAGStatus dagStatus = getDagStatus();

        if (dagStatus == null) {
            flowStep.logWarn("getDagStatus returned null");

            return null;
        }

        DAGStatus.State state = dagStatus.getState();

        if (state == null)
            flowStep.logWarn("dagStatus#getState returned null");

        return state;
    }

    /**
     * Tells whether the DAG reports itself as completed, logging a warning when no status is
     * available.
     *
     * @return {@code true} only if a status was obtained and it is completed
     */
    private boolean isDagStatusComplete() {
        DAGStatus dagStatus = getDagStatus();

        if (dagStatus == null)
            flowStep.logWarn("getDagStatus returned null");

        return dagStatus != null && dagStatus.isCompleted();
    }

    /**
     * Fetches the DAG status without counters.
     *
     * @return the status, or {@code null} if the DAG has not been submitted or the client threw a
     *         {@link NullPointerException}
     * @throws CascadingException wrapping any {@link IOException} or {@link TezException}
     */
    private DAGStatus getDagStatus() {
        if (dagClient == null)
            return null;

        try {
            return dagClient.getDAGStatus(null);
        } catch (NullPointerException exception) {
            // deliberately tolerated: the client is known to throw this, treat it as "no status"
            flowStep.logWarn("NPE thrown by getDAGStatus, known issue");

            return null;
        } catch (IOException | TezException exception) {
            throw new CascadingException(exception);
        }
    }

    /**
     * Fetches the DAG status including counters. Not called from within this class.
     *
     * @return the status, or {@code null} if the DAG has not been submitted
     * @throws CascadingException wrapping any {@link IOException} or {@link TezException}
     */
    private DAGStatus getDagStatusWithCounters() {
        if (dagClient == null)
            return null;

        try {
            return dagClient.getDAGStatus(STATUS_GET_OPTS);
        } catch (IOException | TezException exception) {
            throw new CascadingException("unable to get counters from dag client", exception);
        }
    }

    /**
     * Attempts to kill the DAG and then stops both clients; returns immediately, without stopping
     * the clients, if the DAG already reports completed.
     *
     * @throws IOException declared by the signature; not thrown by this implementation directly
     */
    protected void internalBlockOnStop() throws IOException {
        if (isDagStatusComplete())
            return;

        try {
            if (dagClient != null)
                dagClient.tryKillDAG(); // sometimes throws an NPE
        } catch (Exception exception) {
            // best effort: a failed kill must not prevent the clients from being stopped
            flowStep.logWarn("exception during attempt to kill dag", exception);
        }

        stopDAGClient();
        stopTezClient();
    }

    /** Closes the DAG client and stops the Tez client. */
    @Override
    protected void internalCleanup() {
        stopDAGClient();
        stopTezClient();
    }

    /** Closes the DAG client if one exists; any exception is logged and swallowed. */
    private void stopDAGClient() {
        try {
            if (dagClient != null)
                dagClient.close(); // may throw an NPE
        } catch (Exception exception) {
            flowStep.logWarn("exception during attempt to cleanup client", exception);
        }
    }

    /**
     * Stops the Tez client if one exists; any exception is logged and swallowed.
     * <p>
     * For remote execution the client is stopped directly. For local execution the stop call is
     * submitted with a five minute timeout, because the local client frequently hangs on stop.
     */
    private void stopTezClient() {
        try {
            if (tezClient == null)
                return;

            if (isRemoteExecution()) {
                tezClient.stop(); // will shutdown the session
                return;
            }

            // the Tez LocalClient will frequently hang on #stop(), this causes tests to never complete
            Boolean result = Util.submitWithTimeout(new Callable<Boolean>() {
                @Override
                public Boolean call() throws Exception {
                    tezClient.stop();
                    return true;
                }
            }, 5, TimeUnit.MINUTES);

            if (result == null || !result)
                flowStep.logWarn("tezClient#stop() timed out after 5 minutes, cancelling call, continuing");
        } catch (Exception exception) {
            flowStep.logWarn("exception during attempt to cleanup client", exception);
        }
    }

    /**
     * Tells whether the DAG has completed in the {@code SUCCEEDED} state.
     *
     * @return {@code true} if the DAG is completed and its state is {@code SUCCEEDED}
     * @throws IOException declared by the signature; not thrown by this implementation directly
     */
    protected boolean internalNonBlockingIsSuccessful() throws IOException {
        return isDagStatusComplete() && getDagStatusState() == DAGStatus.State.SUCCEEDED;
    }

    /**
     * Tells whether this job runs remotely.
     *
     * @return {@code true} unless {@link HadoopUtil#isLocal} reports the configuration as local
     */
    @Override
    protected boolean isRemoteExecution() {
        return !HadoopUtil.isLocal(jobConfiguration);
    }

    /**
     * Always returns {@code null}; this implementation does not report a failure cause.
     *
     * @return {@code null}
     */
    @Override
    protected Throwable getThrowable() {
        return null;
    }

    /**
     * Returns the execution context reported by the DAG client.
     * <p>
     * Must only be called after the DAG has been submitted; before that {@link #dagClient} is
     * {@code null} and this method throws a {@link NullPointerException}.
     *
     * @return the value of {@code DAGClient#getExecutionContext()}
     */
    protected String internalJobId() {
        return dagClient.getExecutionContext();
    }

    /**
     * Tells whether the DAG reports itself as completed.
     *
     * @return {@code true} if a status was obtained and it is completed
     * @throws IOException declared by the signature; not thrown by this implementation directly
     */
    protected boolean internalNonBlockingIsComplete() throws IOException {
        return isDagStatusComplete();
    }

    /** Does nothing; no Tez-specific debug dump is implemented. */
    protected void dumpDebugInfo() {
        // intentionally empty
    }

    /**
     * Tells whether the DAG has started running, or has already completed.
     *
     * @return {@code true} if the DAG state is {@code RUNNING} or the DAG is completed
     */
    protected boolean internalIsStartedRunning() {
        // The DAG state is used here because it seems to be set sooner in tests, although it is
        // unclear whether tasks are actually engaged at that point. The alternative would be to
        // sum the running, failed, killed and succeeded task counts from the DAG progress and
        // report started once that sum is positive.
        return getDagStatusState() == DAGStatus.State.RUNNING || isDagStatusComplete();
    }
}