cascading.flow.hadoop.FlowPlatformTest.java Source code

Introduction

Here is the source code for cascading.flow.hadoop.FlowPlatformTest.java, a Cascading platform test that exercises Flow behavior on Hadoop: local versus cluster planning, stopping a running flow, failed operation serialization, listener failure handling, flow IDs, and configuration copying.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import cascading.PlatformTestCase;
import cascading.flow.FailingFlowListener;
import cascading.flow.Flow;
import cascading.flow.FlowProcess;
import cascading.flow.FlowStep;
import cascading.flow.Flows;
import cascading.flow.LockingFlowListener;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.FlowStepJob;
import cascading.operation.BaseOperation;
import cascading.operation.Debug;
import cascading.operation.Filter;
import cascading.operation.FilterCall;
import cascading.operation.Function;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.platform.hadoop.BaseHadoopPlatform;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextLine;
import cascading.stats.CascadingStats;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.Lfs;
import cascading.tuple.Fields;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static data.InputData.inputFileLower;
import static data.InputData.inputFileUpper;
import static java.util.concurrent.Executors.newSingleThreadExecutor;

/**
 * Platform tests for Flow behavior on Hadoop: local versus cluster planning,
 * stopping a running flow, failed operation serialization, listener failure
 * handling, flow IDs, and configuration copying.
 */
public class FlowPlatformTest extends PlatformTestCase {
    private static final Logger LOG = LoggerFactory.getLogger(FlowPlatformTest.class);

    public FlowPlatformTest() {
        super(true); // must be run in cluster mode
    }

    // the flow is not executed; this test only verifies it is planned to run locally

    @Test
    public void testLocalModeSource() throws Exception {
        Tap source = new Lfs(new TextLine(), "input/path");
        Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

        Pipe pipe = new Pipe("test");

        Flow flow = getPlatform().getFlowConnector().connect(source, sink, pipe);

        List<FlowStep> steps = flow.getFlowSteps();

        assertEquals("wrong size", 1, steps.size());

        FlowStep step = steps.get(0);

        boolean isLocal = HadoopUtil.isLocal((Configuration) step.getConfig());

        assertTrue("is not local", isLocal);
    }

    // the flow is not executed; this test only verifies it is planned to run locally

    @Test
    public void testLocalModeSink() throws Exception {
        Tap source = new Hfs(new TextLine(), "input/path");
        Tap sink = new Lfs(new TextLine(), "output/path", SinkMode.REPLACE);

        Pipe pipe = new Pipe("test");

        Flow flow = getPlatform().getFlowConnector().connect(source, sink, pipe);

        List<FlowStep> steps = flow.getFlowSteps();

        assertEquals("wrong size", 1, steps.size());

        FlowStep step = steps.get(0);

        boolean isLocal = HadoopUtil.isLocal((Configuration) step.getConfig());

        assertTrue("is not local", isLocal);
    }

    // the flow is not executed; this test only verifies it is planned to run on the cluster
    @Test
    public void testNotLocalMode() throws Exception {
        if (!getPlatform().isUseCluster())
            return;

        Tap source = new Hfs(new TextLine(), "input/path");
        Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

        Pipe pipe = new Pipe("test");

        Flow flow = getPlatform().getFlowConnector().connect(source, sink, pipe);

        List<FlowStep> steps = flow.getFlowSteps();

        assertEquals("wrong size", 1, steps.size());

        FlowStep step = steps.get(0);

        boolean isLocal = HadoopUtil
                .isLocal((Configuration) ((BaseFlowStep) step).createInitializedConfig(flow.getFlowProcess(),
                        ((BaseHadoopPlatform) getPlatform()).getConfiguration()));

        assertTrue("is local", !isLocal);
    }
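
    // Note on the three locality tests above: HadoopUtil.isLocal( conf ) reports
    // whether a step's Configuration targets Hadoop's local mode (for example,
    // "mapred.job.tracker" = "local" on Hadoop 1, or "mapreduce.framework.name" =
    // "local" on Hadoop 2). An Lfs tap lives on the local filesystem, so any step
    // touching one must be planned to run locally.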

    @Test
    public void testStop() throws Exception {
        // introduces race condition if run without cluster
        if (!getPlatform().isUseCluster())
            return;

        getPlatform().copyFromLocal(inputFileLower);
        getPlatform().copyFromLocal(inputFileUpper);

        Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);
        Tap sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), inputFileUpper);

        Map sources = new HashMap();

        sources.put("lower", sourceLower);
        sources.put("upper", sourceUpper);

        Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

        // using null pos so all fields are written
        Tap sink = new Hfs(new TextLine(), getOutputPath("stopped"), SinkMode.REPLACE);

        Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

        pipeLower = new GroupBy(pipeLower, new Fields("num"));

        Pipe pipeUpper = new Each(new Pipe("upper"), new Fields("line"), splitter);

        pipeUpper = new GroupBy(pipeUpper, new Fields("num"));

        Pipe splice = new CoGroup(pipeLower, new Fields("num"), pipeUpper, new Fields("num"), Fields.size(4));

        final Flow flow = getPlatform().getFlowConnector(getProperties()).connect(sources, sink, splice);

        //    flow.writeDOT( "stopped.dot" ); // uncomment to write the plan as a DOT file

        final LockingFlowListener listener = new LockingFlowListener();

        flow.addListener(listener);

        LOG.info("calling start");
        flow.start();

        Util.safeSleep(5000);

        assertTrue("did not start", listener.started.tryAcquire(60, TimeUnit.SECONDS));

        while (true) {
            LOG.info("testing if running");
            Thread.sleep(1000);

            Map<String, FlowStepJob> map = Flows.getJobsMap(flow);

            if (map == null || map.values().size() == 0)
                continue;

            FlowStepJob flowStepJob = map.values().iterator().next();

            if (flowStepJob.getStepStats().getStatus() == CascadingStats.Status.FAILED)
                fail("failed to start Hadoop step, please check your environment.");

            if (flowStepJob.isStarted())
                break;
        }

        final Semaphore start = new Semaphore(0);
        final long startTime = System.nanoTime();

        Future<Long> future = newSingleThreadExecutor().submit(new Callable<Long>() {
            @Override
            public Long call() throws Exception {
                start.release();
                LOG.info("calling complete");
                flow.complete();
                return System.nanoTime() - startTime;
            }
        });

        start.acquire();
        LOG.info("calling stop");
        flow.stop();

        long stopTime = System.nanoTime() - startTime;
        long completeTime = future.get();

        assertTrue(String.format("stop: %s complete: %s", stopTime, completeTime), stopTime <= completeTime);

        assertTrue("did not stop", listener.stopped.tryAcquire(60, TimeUnit.SECONDS));
        assertTrue("did not complete", listener.completed.tryAcquire(60, TimeUnit.SECONDS));
    }
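
    // Note: the timing assertion above shows stop() returned before the blocked
    // complete() call did; stopping the flow is what unblocked the thread
    // waiting in complete().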

    private static class BadFilter extends BaseOperation implements Filter {
        private Object object = new Object(); // intentionally not Serializable, forces planning to fail

        public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
            return false;
        }
    }
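
    // Operations are serialized into the job configuration when the Flow is
    // planned; the plain java.lang.Object field above is not Serializable, so
    // planning the assembly in the test below is expected to throw.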

    @Test
    public void testFailedSerialization() throws Exception {
        getPlatform().copyFromLocal(inputFileLower);

        Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);

        Map sources = new HashMap();

        sources.put("lower", sourceLower);

        Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

        // using null pos so all fields are written
        Tap sink = new Hfs(new TextLine(), getOutputPath("badserialization"), SinkMode.REPLACE);

        Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

        pipeLower = new Each(pipeLower, new Fields("num"), new BadFilter());

        pipeLower = new GroupBy(pipeLower, new Fields("num"));

        try {
            // assembly serialization now happens during Flow construction, no chance to use a listener to catch
            Flow flow = getPlatform().getFlowConnector(getProperties()).connect(sources, sink, pipeLower);
            fail("did not throw serialization exception");
        } catch (Exception exception) {
            // ignore
        }
    }

    @Test
    public void testStartStopRace() throws Exception {
        getPlatform().copyFromLocal(inputFileLower);

        Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);

        Map sources = new HashMap();

        sources.put("lower", sourceLower);

        Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

        // using null pos so all fields are written
        Tap sink = new Hfs(new TextLine(), getOutputPath("startstop"), SinkMode.REPLACE);

        Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

        pipeLower = new GroupBy(pipeLower, new Fields("num"));

        Flow flow = getPlatform().getFlowConnector(getProperties()).connect(sources, sink, pipeLower);

        flow.start();
        flow.stop(); // should not fail
    }
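
    // Note: stop() must be safe to call immediately after start(), even if no
    // step jobs have been submitted yet.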

    @Test
    public void testFailingListenerStarting() throws Exception {
        failingListenerTest(FailingFlowListener.OnFail.STARTING);
    }

    @Test
    public void testFailingListenerStopping() throws Exception {
        failingListenerTest(FailingFlowListener.OnFail.STOPPING);
    }

    @Test
    public void testFailingListenerCompleted() throws Exception {
        failingListenerTest(FailingFlowListener.OnFail.COMPLETED);
    }

    @Test
    public void testFailingListenerThrowable() throws Exception {
        failingListenerTest(FailingFlowListener.OnFail.THROWABLE);
    }

    private void failingListenerTest(FailingFlowListener.OnFail onFail) throws Exception {
        getPlatform().copyFromLocal(inputFileLower);
        getPlatform().copyFromLocal(inputFileUpper);

        Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);
        Tap sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), inputFileUpper);

        Map sources = new HashMap();

        sources.put("lower", sourceLower);
        sources.put("upper", sourceUpper);

        Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

        // using null pos so all fields are written
        Tap sink = new Hfs(new TextLine(), getOutputPath(onFail + "/stopped"), SinkMode.REPLACE);

        Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

        if (onFail == FailingFlowListener.OnFail.THROWABLE) {
            pipeLower = new Each(pipeLower, new Debug() {
                @Override
                public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
                    throw new RuntimeException("failing inside pipe assembly intentionally");
                }
            });
        }

        pipeLower = new GroupBy(pipeLower, new Fields("num"));

        Pipe pipeUpper = new Each(new Pipe("upper"), new Fields("line"), splitter);

        pipeUpper = new GroupBy(pipeUpper, new Fields("num"));

        Pipe splice = new CoGroup(pipeLower, new Fields("num"), pipeUpper, new Fields("num"), Fields.size(4));

        Flow flow = getPlatform().getFlowConnector(getProperties()).connect(sources, sink, splice);

        //    flow.writeDOT( "stopped.dot" ); // uncomment to write the plan as a DOT file

        FailingFlowListener listener = new FailingFlowListener(onFail);

        flow.addListener(listener);

        LOG.info("calling start");
        flow.start();

        assertTrue("did not start", listener.started.tryAcquire(120, TimeUnit.SECONDS));

        if (onFail == FailingFlowListener.OnFail.STOPPING) {
            while (true) {
                LOG.info("testing if running");
                Thread.sleep(1000);

                Map<String, FlowStepJob> map = Flows.getJobsMap(flow);

                if (map == null || map.values().size() == 0)
                    continue;

                FlowStepJob flowStepJob = map.values().iterator().next();

                if (flowStepJob.getStepStats().getStatus() == CascadingStats.Status.FAILED)
                    fail("failed to start Hadoop step, please check your environment.");

                if (flowStepJob.isStarted())
                    break;
            }

            LOG.info("calling stop");

            flow.stop();
        }

        assertTrue("did not complete", listener.completed.tryAcquire(360, TimeUnit.SECONDS));
        assertTrue("did not stop", listener.stopped.tryAcquire(360, TimeUnit.SECONDS));

        try {
            flow.complete();
            fail("did not rethrow exception from listener");
        } catch (Exception exception) {
            // ignore
        }
    }
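
    // Note: an exception thrown from a FlowListener callback is captured by the
    // Flow and rethrown from complete(), which is what the final try/catch above
    // verifies.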

    @Test
    public void testFlowID() throws Exception {
        Tap source = new Lfs(new TextLine(), "input/path");
        Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

        Pipe pipe = new Pipe("test");

        Map<Object, Object> props = getProperties();
        Flow flow1 = getPlatform().getFlowConnector(props).connect(source, sink, pipe);

        //    System.out.println( "flow.getID() = " + flow1.getID() );

        assertNotNull("missing id", flow1.getID());

        assertNotNull("missing id in conf", flow1.getProperty("cascading.flow.id"));

        Flow flow2 = getPlatform().getFlowConnector(props).connect(source, sink, pipe);

        assertTrue("same id", !flow1.getID().equalsIgnoreCase(flow2.getID()));
    }

    @Test
    public void testCopyConfig() throws Exception {
        Tap source = new Lfs(new TextLine(), "input/path");
        Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

        Pipe pipe = new Pipe("test");

        Configuration conf = ((BaseHadoopPlatform) getPlatform()).getConfiguration();

        conf.set(AppProps.APP_NAME, "testname");

        AppProps props = AppProps.appProps().setVersion("1.2.3");

        Properties properties = props.buildProperties(conf); // convert job conf to properties instance

        Flow flow = getPlatform().getFlowConnector(properties).connect(source, sink, pipe);

        assertEquals("testname", flow.getProperty(AppProps.APP_NAME));
        assertEquals("1.2.3", flow.getProperty(AppProps.APP_VERSION));
    }
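
    // Note: buildProperties( conf ) copies the Configuration's entries into a
    // Properties instance and layers the AppProps settings on top, so values set
    // directly on the cluster Configuration carry through to the planned Flow.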

    @Test
    public void testStartWithoutComplete() throws Exception {
        getPlatform().copyFromLocal(inputFileLower);

        Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);

        Map sources = new HashMap();

        sources.put("lower", sourceLower);

        Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

        // using null pos so all fields are written
        Tap sink = new Hfs(new TextLine(), getOutputPath("withoutcomplete"), SinkMode.REPLACE);

        Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

        pipeLower = new GroupBy(pipeLower, new Fields("num"));

        Flow flow = getPlatform().getFlowConnector(getProperties()).connect(sources, sink, pipeLower);

        LockingFlowListener listener = new LockingFlowListener();

        flow.addListener(listener);

        flow.start();

        assertTrue(listener.completed.tryAcquire(90, TimeUnit.SECONDS));
    }
}
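
Example

A minimal sketch of the planning pattern the tests above exercise, assuming the same Cascading 2.x Hadoop APIs imported in the source; HadoopFlowConnector is the stock Hadoop-platform connector, and the paths are placeholders:

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class FlowExample {
    public static void main(String[] args) {
        Tap source = new Hfs(new TextLine(new Fields("offset", "line")), "input/path");
        Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

        // a pass-through assembly; planning alone assigns the flow its ID and steps
        Pipe pipe = new Pipe("copy");

        Flow flow = new HadoopFlowConnector(new Properties()).connect(source, sink, pipe);

        flow.complete(); // plans, submits, and blocks until the flow finishes
    }
}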