HadoopWordCount.java Source code

Introduction

Here is the source code for HadoopWordCount.java, a Hazelcast Jet word count example that reads its input from and writes its results to HDFS.

Source

/*
 * Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.hazelcast.jet.DAG;
import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Vertex;
import com.hazelcast.jet.config.InstanceConfig;
import com.hazelcast.jet.config.JetConfig;
import com.hazelcast.jet.processor.HdfsProcessors;
import com.hazelcast.jet.processor.Processors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import javax.annotation.Nonnull;
import java.util.regex.Pattern;

import static com.hazelcast.jet.AggregateOperations.counting;
import static com.hazelcast.jet.Edge.between;
import static com.hazelcast.jet.Partitioner.HASH_CODE;
import static com.hazelcast.jet.Traversers.traverseArray;
import static com.hazelcast.jet.function.DistributedFunctions.entryKey;
import static com.hazelcast.jet.function.DistributedFunctions.wholeItem;
import static com.hazelcast.jet.processor.Processors.flatMap;
import static java.lang.Runtime.getRuntime;
import static java.lang.System.nanoTime;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

/**
 * Word count example adapted to read from and write to HDFS instead of Jet
 * in-memory maps.
 * <p>
 * For more details about the word count DAG itself, please see the JavaDoc for the
 * {@code WordCount} class in {@code wordcount-core-api} sample.
 * <p>
 * {@link HdfsProcessors#readHdfs(JobConf) readHdfs()} is a processor factory
 * that can be used for reading from HDFS given a {@code JobConf} with input
 * paths and input formats. The files in the input folder will be split among
 * Jet processors, using {@code InputSplit}s.
 * <p>
 * {@link HdfsProcessors#writeHdfs(JobConf) writeHdfs()} writes the output to
 * the given output path, with each processor writing to a single file within
 * the path. The files are identified by the member ID and the local ID of the
 * writing processor. Unlike in MapReduce, the output files are not sorted by
 * key.
 * <p>
 * In this example, files are read and written using {@code TextInputFormat}
 * and {@code TextOutputFormat} respectively, but the example can be adjusted
 * to work with any input/output format.
 */
public class HadoopWordCount {

    private static final String OUTPUT_PATH = "hadoop-word-count";

    public static void main(String[] args) throws Exception {
        // route Hazelcast and Jet logging through log4j
        System.setProperty("hazelcast.logging.type", "log4j");

        // the sample "books" input directory is loaded from the classpath
        Path inputPath = new Path(HadoopWordCount.class.getClassLoader().getResource("books").getPath());
        Path outputPath = new Path(OUTPUT_PATH);

        // set up the Hadoop job config, the input and output paths and formats
        JobConf jobConfig = new JobConf();
        jobConfig.setInputFormat(TextInputFormat.class);
        jobConfig.setOutputFormat(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(jobConfig, outputPath);
        TextInputFormat.addInputPath(jobConfig, inputPath);

        // Delete the output directory if it already exists
        FileSystem.get(new Configuration()).delete(outputPath, true);

        // limit each member to half the available cores for cooperative threads,
        // since two members will run in this JVM
        JetConfig cfg = new JetConfig();
        cfg.setInstanceConfig(new InstanceConfig()
                .setCooperativeThreadCount(Math.max(1, getRuntime().availableProcessors() / 2)));

        // start a two-member embedded Jet cluster
        JetInstance jetInstance = Jet.newJetInstance(cfg);
        Jet.newJetInstance(cfg);

        try {
            System.out.print("\nCounting words from " + inputPath);
            long start = nanoTime();
            jetInstance.newJob(buildDag(jobConfig)).execute().get();
            System.out.print("Done in " + NANOSECONDS.toMillis(nanoTime() - start) + " milliseconds.");
            System.out.println("Output written to " + outputPath);
        } finally {
            Jet.shutdownAll();
        }
    }

    @Nonnull
    private static DAG buildDag(JobConf jobConf) {
        final Pattern delimiter = Pattern.compile("\\W+");

        DAG dag = new DAG();

        // read (byte offset, line) pairs and keep only the line text
        Vertex source = dag.newVertex("source", HdfsProcessors.readHdfs(jobConf, (k, v) -> v.toString()));
        // line -> words
        Vertex tokenize = dag.newVertex("tokenize",
                flatMap((String line) -> traverseArray(delimiter.split(line.toLowerCase()))
                        .filter(word -> !word.isEmpty())));
        // word -> (word, count)
        Vertex accumulate = dag.newVertex("accumulate", Processors.accumulateByKey(wholeItem(), counting()));
        // (word, count) -> (word, count)
        Vertex combine = dag.newVertex("combine", Processors.combineByKey(counting()));

        Vertex sink = dag.newVertex("sink", HdfsProcessors.writeHdfs(jobConf));

        // tokenize -> accumulate is partitioned locally by the word itself;
        // accumulate -> combine is distributed and partitioned so that all
        // partial counts of a word end up on a single member
        return dag.edge(between(source, tokenize))
                .edge(between(tokenize, accumulate).partitioned(wholeItem(), HASH_CODE))
                .edge(between(accumulate, combine).distributed().partitioned(entryKey()))
                .edge(between(combine, sink));
    }
}
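
Reading the output

Once the job completes, writeHdfs() leaves one output file per writing processor inside the hadoop-word-count directory. The snippet below is a minimal sketch (a hypothetical PrintWordCountOutput helper, not part of the sample) that uses the plain Hadoop FileSystem API to list that directory and print the word counts; it assumes the same default Configuration as the job, i.e. the local file system unless an HDFS cluster is configured.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class PrintWordCountOutput {

    public static void main(String[] args) throws Exception {
        // same default Configuration the word-count job used
        FileSystem fs = FileSystem.get(new Configuration());
        // OUTPUT_PATH constant from HadoopWordCount
        Path outputDir = new Path("hadoop-word-count");

        // one file per writing processor; skip metadata files such as _SUCCESS
        for (FileStatus status : fs.listStatus(outputDir)) {
            String name = status.getPath().getName();
            if (status.isDirectory() || name.startsWith("_") || name.startsWith(".")) {
                continue;
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fs.open(status.getPath()), StandardCharsets.UTF_8))) {
                reader.lines().forEach(System.out::println);
            }
        }
    }
}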