com.cloudera.dataflow.spark.TransformTranslatorTest.java Source code

Introduction

Here is the source code for com.cloudera.dataflow.spark.TransformTranslatorTest.java, a JUnit test for Cloudera's Spark runner for Google Cloud Dataflow pipelines. The test builds a small Dataflow pipeline, runs it on both the DirectPipelineRunner and the SparkPipelineRunner, and checks that the two runners produce identical output.

Source

/*
 * Copyright (c) 2015, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.dataflow.spark;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import org.apache.commons.io.FileUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;

/**
 * A test for the transforms registered in TransformTranslator.
 * Builds a regular Dataflow pipeline with each of the mapped
 * transforms, and makes sure that they work when the pipeline is
 * executed in Spark.
 */
public class TransformTranslatorTest {

    // Exposes the name of the currently running test method; init() uses it
    // to build a per-test output directory.
    @Rule
    public TestName name = new TestName();

    private DirectPipelineRunner directRunner;
    private SparkPipelineRunner sparkRunner;
    private String testDataDirName;

    @Before
    public void init() throws IOException {
        sparkRunner = SparkPipelineRunner.create();
        directRunner = DirectPipelineRunner.createForTest();
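        // Give each test method its own output directory under target/test-data,
        // wiping anything left over from a previous run.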
        testDataDirName = Joiner.on(File.separator).join("target", "test-data", name.getMethodName())
                + File.separator;
        FileUtils.deleteDirectory(new File(testDataDirName));
        new File(testDataDirName).mkdirs();
    }

    /**
     * Builds a simple pipeline with TextIO.Read and TextIO.Write, runs it on
     * both the DirectPipelineRunner and the SparkPipelineRunner (using the
     * mapped Dataflow-to-Spark transforms), and finally verifies that the two
     * runs produce the same results.
     */
    @Test
    public void testTextIOReadAndWriteTransforms() throws IOException {
        String directOut = runPipeline("direct", directRunner);
        String sparkOut = runPipeline("spark", sparkRunner);

        // TextIO.Write produces sharded output files; with a single shard the
        // result is written to <prefix>-00000-of-00001.
        List<String> directOutput =
                Files.readAllLines(Paths.get(directOut + "-00000-of-00001"), Charsets.UTF_8);
        List<String> sparkOutput =
                Files.readAllLines(Paths.get(sparkOut + "-00000-of-00001"), Charsets.UTF_8);

        // Sort the output to get a stable result (PCollections are not ordered).
        Collections.sort(directOutput);
        Collections.sort(sparkOutput);

        Assert.assertArrayEquals(directOutput.toArray(), sparkOutput.toArray());
    }

    /**
     * Reads the test fixture src/test/resources/test_text.txt, writes it back
     * out under the per-test data directory, and executes the pipeline on the
     * given runner. Returns the output file prefix; TextIO.Write appends the
     * shard suffix to it.
     */
    private String runPipeline(String name, PipelineRunner<?> runner) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        String outFile = Joiner.on(File.separator).join(testDataDirName, "test_text_out_" + name);
        PCollection<String> lines = p.apply(TextIO.Read.from("src/test/resources/test_text.txt"));
        lines.apply(TextIO.Write.to(outFile));
        runner.run(p);
        return outFile;
    }
}
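
Example

The test above compares two runners on the same pipeline. As a minimal, self-contained sketch of that pattern, the following program reads a text file, writes it back out, and executes the pipeline on the Spark runner. It uses only calls that appear in the test above; the input and output paths are hypothetical placeholders, and whether SparkPipelineRunner.create() targets a local Spark master by default is an assumption to verify against the project's documentation.

/*
 * A minimal sketch (not part of the original test): run a TextIO
 * read/write pipeline on the SparkPipelineRunner.
 */
package com.cloudera.dataflow.spark;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class CopyTextOnSpark {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        // Hypothetical paths; substitute real input/output locations.
        PCollection<String> lines = p.apply(TextIO.Read.from("/tmp/input.txt"));
        lines.apply(TextIO.Write.to("/tmp/output"));
        // Executes the pipeline on Spark; with a single shard the output
        // lands in /tmp/output-00000-of-00001, as in the test above.
        SparkPipelineRunner.create().run(p);
    }
}

The key point the test relies on is that a pipeline is built once, runner-agnostically, and only the PipelineRunner passed to run() decides where it executes.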