$.WordCount.java Source code

Java tutorial

Introduction

Here is the source code for $.WordCount.java

Source

    ##
    ## Licensed to the Apache Software Foundation (ASF) under one
    ## or more contributor license agreements.  See the NOTICE file
    ## distributed with this work for additional information
    ## regarding copyright ownership.  The ASF licenses this file
    ## to you under the Apache License, Version 2.0 (the
    ## "License"); you may not use this file except in compliance
    ## with the License.  You may obtain a copy of the License at
    ##
    ##   http://www.apache.org/licenses/LICENSE-2.0
    ##
    ## Unless required by applicable law or agreed to in writing,
    ## software distributed under the License is distributed on an
    ## "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    ## KIND, either express or implied.  See the License for the
    ## specific language governing permissions and limitations
    ## under the License.
    ##
    #set( $symbol_pound = '#' )
    #set( $symbol_dollar = '$' )
    #set( $symbol_escape = '\' )
package ${package};

    import org.apache.crunch.PCollection;
    import org.apache.crunch.PTable;
    import org.apache.crunch.Pipeline;
    import org.apache.crunch.PipelineResult;
    import org.apache.crunch.impl.mr.MRPipeline;
    import org.apache.crunch.types.writable.Writables;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.GenericOptionsParser;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    /**
     * A word count example for Apache Crunch, based on Crunch's example projects.
     */
    public class WordCount extends Configured implements Tool {

        public static void main(String[] args) throws Exception {
            ToolRunner.run(new Configuration(), new WordCount(), args);
        }

        public int run(String[] args) throws Exception {

            if (args.length != 2) {
                System.err.println(
                        "Usage: hadoop jar ${artifactId}-${version}-job.jar" + " [generic options] input output");
                System.err.println();
                GenericOptionsParser.printGenericCommandUsage(System.err);
                return 1;
            }

            String inputPath = args[0];
            String outputPath = args[1];

            // Create an object to coordinate pipeline creation and execution.
            Pipeline pipeline = new MRPipeline(WordCount.class, getConf());

            // Reference a given text file as a collection of Strings.
            PCollection<String> lines = pipeline.readTextFile(inputPath);

            // Define a function that splits each line in a PCollection of Strings into
            // a PCollection made up of the individual words in the file.
            // The second argument sets the serialization format.
            PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());

            // Take the collection of words and remove known stop words.
            PCollection<String> noStopWords = words.filter(new StopWordFilter());

            // The count method applies a series of Crunch primitives and returns
            // a map of the unique words in the input PCollection to their counts.
            PTable<String, Long> counts = noStopWords.count();

            // Instruct the pipeline to write the resulting counts to a text file.
            pipeline.writeTextFile(counts, outputPath);

            // Execute the pipeline as a MapReduce.
            PipelineResult result = pipeline.done();

            return result.succeeded() ? 0 : 1;
        }
    }