BuildInvertedIndex.java Source code

Java tutorial

Introduction

Here is the source code for BuildInvertedIndex.java

Source

    /*
     * Cloud9: A Hadoop toolkit for working with big data
     *
     * Licensed under the Apache License, Version 2.0 (the "License"); you
     * may not use this file except in compliance with the License. You may
     * obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
     * implied. See the License for the specific language governing
     * permissions and limitations under the License.
     */

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.Map.Entry;

    import org.apache.commons.cli.CommandLine;
    import org.apache.commons.cli.CommandLineParser;
    import org.apache.commons.cli.GnuParser;
    import org.apache.commons.cli.HelpFormatter;
    import org.apache.commons.cli.OptionBuilder;
    import org.apache.commons.cli.Options;
    import org.apache.commons.cli.ParseException;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.log4j.Logger;

    import tl.lin.data.array.ArrayListWritable;
    import tl.lin.data.fd.Object2IntFrequencyDistribution;
    import tl.lin.data.fd.Object2IntFrequencyDistributionEntry;
    import tl.lin.data.pair.PairOfInts;
    import tl.lin.data.pair.PairOfObjectInt;
    import tl.lin.data.pair.PairOfWritables;

    public class BuildInvertedIndex extends Configured implements Tool {
        private static final Logger LOG = Logger.getLogger(BuildInvertedIndex.class);

        private static class MyMapper extends Mapper<LongWritable, Text, Text, PairOfInts> {
            private static final Text WORD = new Text();
            //      private static final Object2IntFrequencyDistribution<String> COUNTS =
            //            new Object2IntFrequencyDistributionEntry<String>();
            private static final HashMap<String, Integer> COUNTS = new HashMap<String, Integer>();

            @Override
            public void map(LongWritable docno, Text doc, Context context) throws IOException, InterruptedException {
                String text = doc.toString();
                COUNTS.clear();

                String[] terms = text.split("\\s+");

                // First build a histogram of the terms.
                for (String term : terms) {
                    if (term == null || term.length() == 0) {
                        continue;
                    }
                    if (COUNTS.containsKey(term)) {
                        COUNTS.put(/* fill in your code here */);
                    } else {
                        COUNTS.put(/* fill in your code here */);
                    }
                }

                // Emit postings.
                for (Entry<String, Integer> entry : COUNTS.entrySet()) {
                    WORD.set(entry.getKey());
                    context.write(WORD, new PairOfInts((int) docno.get(), entry.getValue()));
                }
            }
        }

        private static class MyReducer
                extends Reducer<Text, PairOfInts, Text, PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>> {
            private final static IntWritable DF = new IntWritable();

      @Override
      public void reduce(Text key, Iterable<PairOfInts> values, Context context)
            throws IOException, InterruptedException {
//         Iterator<PairOfInts> iter = values.iterator();
         ArrayListWritable<PairOfInts> postings = new ArrayListWritable<PairOfInts>();

         int df = 0;
         for (PairOfInts pair : values) {
            /* fill in your code here */++;
            /* fill in your code here */.add(pair.clone());
         }
         
         // Sort the postings by docno ascending.
         Collections.sort(postings);

         DF.set(df);
         context.write(key,
               new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>(DF, postings));
      }
        }

        private BuildInvertedIndex() {
        }

        private static final String INPUT = "input";
        private static final String OUTPUT = "output";
        private static final String NUM_REDUCERS = "numReducers";

        /**
         * Runs this tool.
         */
        @SuppressWarnings({ "static-access" })
        public int run(String[] args) throws Exception {
            Options options = new Options();

            options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
            options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
            options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
                    .create(NUM_REDUCERS));

            CommandLine cmdline;
            CommandLineParser parser = new GnuParser();

            try {
                cmdline = parser.parse(options, args);
            } catch (ParseException exp) {
                System.err.println("Error parsing command line: " + exp.getMessage());
                return -1;
            }

            if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
                System.out.println("args: " + Arrays.toString(args));
                HelpFormatter formatter = new HelpFormatter();
                formatter.setWidth(120);
                formatter.printHelp(this.getClass().getName(), options);
                ToolRunner.printGenericCommandUsage(System.out);
                return -1;
            }

            String inputPath = cmdline.getOptionValue(INPUT);
            String outputPath = cmdline.getOptionValue(OUTPUT);
            int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
                    : 1;

            LOG.info("Tool name: " + BuildInvertedIndex.class.getSimpleName());
            LOG.info(" - input path: " + inputPath);
            LOG.info(" - output path: " + outputPath);
            LOG.info(" - num reducers: " + reduceTasks);

            Job job = Job.getInstance(getConf());
            job.setJobName(BuildInvertedIndex.class.getSimpleName());
            job.setJarByClass(BuildInvertedIndex.class);

            job.setNumReduceTasks(reduceTasks);

            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(PairOfInts.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(PairOfWritables.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);

            // Delete the output directory if it exists already.
            Path outputDir = new Path(outputPath);
            FileSystem.get(getConf()).delete(outputDir, true);

            long startTime = System.currentTimeMillis();
            job.waitForCompletion(true);
            System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

            return 0;
        }

        /**
         * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
         */
        public static void main(String[] args) throws Exception {
            ToolRunner.run(new BuildInvertedIndex(), args);
        }
    }