org.apache.crunch.MultipleOutputIT.java Source code

Introduction

Here is the source code for org.apache.crunch.MultipleOutputIT.java, an Apache Crunch integration test that writes several outputs from a single MapReduce job and verifies that the outputs are fused into one stage, that per-output counters are recorded (or suppressed when disabled), and that fused branches cannot corrupt each other through object reuse.
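
Before the full test, here is a minimal sketch (not part of the original file) of the pattern it exercises: one pipeline reads a single input and, in the same job, writes two filtered views of it to separate outputs. The class name and the input/output paths are placeholders chosen for illustration.

import org.apache.crunch.FilterFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;

public class MultipleOutputSketch {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(MultipleOutputSketch.class);
        PCollection<String> words = pipeline.readTextFile("/tmp/letters.txt");

        // Two parallelDo branches over the same input collection; the Crunch planner
        // fuses them into one job with two outputs, as the test below asserts.
        PCollection<String> even = words.parallelDo("even", new FilterFn<String>() {
            @Override
            public boolean accept(String input) {
                return input.length() % 2 == 0;
            }
        }, Writables.strings());
        PCollection<String> odd = words.parallelDo("odd", new FilterFn<String>() {
            @Override
            public boolean accept(String input) {
                return input.length() % 2 != 0;
            }
        }, Writables.strings());

        pipeline.writeTextFile(even, "/tmp/even");
        pipeline.writeTextFile(odd, "/tmp/odd");
        pipeline.done();
    }
}

The test below builds on the same idea: it adds a third, grouped output, checks the per-output counters that Crunch records, and verifies that fused branches do not leak mutated objects into one another.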

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;

import org.apache.crunch.PipelineResult.StageResult;
import org.apache.crunch.fn.Aggregators;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.At;
import org.apache.crunch.io.CrunchOutputs;
import org.apache.crunch.io.To;
import org.apache.crunch.test.StringWrapper;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.PTypeFamily;
import org.apache.crunch.types.avro.AvroTypeFamily;
import org.apache.crunch.types.avro.Avros;
import org.apache.crunch.types.writable.WritableTypeFamily;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.junit.Rule;
import org.junit.Test;

import com.google.common.collect.Lists;
import com.google.common.io.Files;

public class MultipleOutputIT {
    @Rule
    public TemporaryPath tmpDir = TemporaryPaths.create();

    public static PCollection<String> evenCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
        return words.parallelDo("even", new FilterFn<String>() {
            @Override
            public boolean accept(String input) {
                return input.length() % 2 == 0;
            }
        }, typeFamily.strings());
    }

    public static PCollection<String> oddCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
        return words.parallelDo("odd", new FilterFn<String>() {
            @Override
            public boolean accept(String input) {
                return input.length() % 2 != 0;
            }
        }, typeFamily.strings());
    }

    /** Keys each entry by the first character of its key; not exercised by the tests in this listing. */
    public static PTable<String, Long> substr(PTable<String, Long> ptable) {
        return ptable.parallelDo(new DoFn<Pair<String, Long>, Pair<String, Long>>() {
            @Override
            public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
                if (input.first().length() > 0) {
                    emitter.emit(Pair.of(input.first().substring(0, 1), input.second()));
                }
            }
        }, ptable.getPTableType());
    }

    @Test
    public void testWritables() throws IOException {
        run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()),
                WritableTypeFamily.getInstance());
    }

    @Test
    public void testAvro() throws IOException {
        run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
    }

    @Test
    public void testParallelDosFused() throws IOException {
        PipelineResult result = run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()),
                WritableTypeFamily.getInstance());

        // Ensure our multiple outputs were fused into a single job.
        assertEquals("parallelDo operations were not fused into a single job", 1, result.getStageResults().size());
    }

    @Test
    public void testCountersEnabled() throws IOException {
        PipelineResult result = run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()),
                WritableTypeFamily.getInstance());

        assertEquals(1, result.getStageResults().size());
        StageResult stageResult = result.getStageResults().get(0);

        String counterGroup = CrunchOutputs.class.getName();
        assertEquals(3, stageResult.getCounterNames().get(counterGroup).size());
        assertEquals(1L, stageResult.getCounterValue(counterGroup, "out1"));
        assertEquals(1L, stageResult.getCounterValue(counterGroup, "out2"));
        assertEquals(0L, stageResult.getCounterValue(counterGroup, "out3"));
    }

    @Test
    public void testCountersDisabled() throws IOException {
        Configuration configuration = tmpDir.getDefaultConfiguration();
        configuration.setBoolean(CrunchOutputs.CRUNCH_DISABLE_OUTPUT_COUNTERS, true);

        PipelineResult result = run(new MRPipeline(MultipleOutputIT.class, configuration),
                WritableTypeFamily.getInstance());

        assertEquals(1, result.getStageResults().size());
        StageResult stageResult = result.getStageResults().get(0);

        assertFalse(stageResult.getCounterNames().containsKey(CrunchOutputs.CRUNCH_OUTPUTS));
    }

    public PipelineResult run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
        String inputPath = tmpDir.copyResourceFileName("letters.txt");
        String outputPathEven = tmpDir.getFileName("even");
        String outputPathOdd = tmpDir.getFileName("odd");
        String outputPathReduce = tmpDir.getFileName("reduce");

        PCollection<String> words = pipeline.read(At.textFile(inputPath, typeFamily.strings()));

        PCollection<String> evenCountWords = evenCountLetters(words, typeFamily);
        PCollection<String> oddCountWords = oddCountLetters(words, typeFamily);
        pipeline.writeTextFile(evenCountWords, outputPathEven);
        pipeline.writeTextFile(oddCountWords, outputPathOdd);
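
        // Third output below: key the even-length words by their first letter, then group
        // and combine them, so a single job feeds three separate sinks.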

        evenCountWords.by(new FirstLetterFn(), typeFamily.strings()).groupByKey()
                .combineValues(Aggregators.<String>FIRST_N(10)).write(To.textFile(outputPathReduce));

        PipelineResult result = pipeline.done();

        checkFileContents(outputPathEven, Arrays.asList("bb"));
        checkFileContents(outputPathOdd, Arrays.asList("a"));
        checkNotEmpty(outputPathReduce);

        return result;
    }

    static class FirstLetterFn extends MapFn<String, String> {
        @Override
        public String map(String input) {
            return input.substring(0, 1);
        }
    }

    /**
     * Mutates the state of an input and then emits the mutated object.
     */
    static class AppendFn extends DoFn<StringWrapper, StringWrapper> {

        private String value;

        public AppendFn(String value) {
            this.value = value;
        }

        @Override
        public void process(StringWrapper input, Emitter<StringWrapper> emitter) {
            input.setValue(input.getValue() + value);
            emitter.emit(input);
        }
    }

    /**
     * Fusing multiple pipelines has a risk of running into object reuse bugs.
     * This test verifies that mutating the state of an object that is passed
     * through multiple streams of a pipeline doesn't allow one stream to affect
     * another.
     */
    @Test
    public void testFusedMappersObjectReuseBug() throws IOException {
        Pipeline pipeline = new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration());
        PCollection<StringWrapper> stringWrappers = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
                .parallelDo(new StringWrapper.StringToStringWrapperMapFn(), Avros.reflects(StringWrapper.class));
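
        // AppendFn mutates its input object in place. The two branches below consume the
        // same PCollection, so the expected results only hold if each branch sees its own
        // copy of every value rather than a shared, reused object.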

        PCollection<String> stringsA = stringWrappers.parallelDo(new AppendFn("A"), stringWrappers.getPType())
                .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
        PCollection<String> stringsB = stringWrappers.parallelDo(new AppendFn("B"), stringWrappers.getPType())
                .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());

        String outputA = tmpDir.getFileName("stringsA");
        String outputB = tmpDir.getFileName("stringsB");

        pipeline.writeTextFile(stringsA, outputA);
        pipeline.writeTextFile(stringsB, outputB);
        PipelineResult pipelineResult = pipeline.done();

        // Make sure fusing did actually occur
        assertEquals(1, pipelineResult.getStageResults().size());

        checkFileContents(outputA, Lists.newArrayList("cA", "dA", "aA"));
        checkFileContents(outputB, Lists.newArrayList("cB", "dB", "aB"));
    }

    private void checkNotEmpty(String filePath) throws IOException {
        File dir = new File(filePath);
        File[] partFiles = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.startsWith("part");
            }
        });
        assertNotNull("no output directory found at " + filePath, partFiles);
        assertTrue(partFiles.length > 0);
        assertTrue(Files.readLines(partFiles[0], Charset.defaultCharset()).size() > 0);
    }

    private void checkFileContents(String filePath, List<String> expected) throws IOException {
        File outputFile = new File(filePath, "part-m-00000");
        List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
        assertEquals(expected, lines);
    }
}
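
A note on running it: classes named *IT are integration tests, and in the Crunch build they are typically executed by the Maven Failsafe plugin rather than Surefire. Assuming that convention, this single test can usually be run with something like "mvn -Dit.test=MultipleOutputIT verify" from the containing module.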