de.gesundkrank.wikipedia.hadoop.util.RepackToMapFile.java Source code

Java tutorial

Introduction

Here is the source code for de.gesundkrank.wikipedia.hadoop.util.RepackToMapFile.java

Source

/*
 * This file is part of wikipedia-hadoop.
 * Copyright (C) 2015 Jan Graßegger
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package de.gesundkrank.wikipedia.hadoop.util;

import de.gesundkrank.wikipedia.hadoop.WikiRevisionWritable;
import de.gesundkrank.wikipedia.hadoop.inputformat.WikiInputFormat;
import de.gesundkrank.wikipedia.hadoop.io.WikiDumpLoader;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Hadoop {@link Tool} that reads Wikipedia revisions through {@link WikiInputFormat} and writes
 * them back out, unchanged and under their input key, using {@code MapFileOutputFormat}.
 *
 * <p>Command line options: {@code -b/--basePath} (required), {@code -o/--outputPath} (required),
 * {@code -c/--checkNew}, {@code -r/--skipRedirect} and {@code -h/--help}.
 *
 * @author Jan Graegger<jan.grassegger@uni-weimar.de>
 */
public class RepackToMapFile extends Configured implements Tool {
    private static final Logger LOGGER = Logger.getLogger(RepackToMapFile.class);

    /** Configuration key through which the driver tells {@link WikiMapper} to drop redirects. */
    private static final String SKIP_REDIRECT_KEY = "skipRedirect";

    /**
     * Parses the command line and starts the job.
     *
     * @param args command line arguments as handed over by {@link ToolRunner}
     * @return {@code 0} if help was printed or the job succeeded, {@code 1} if the arguments
     *         could not be parsed or the job failed
     * @throws Exception if setting up or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Options options = getOptions();

        // -b and -o are required, so the parser rejects a lone "-h" with a missing-option
        // error. Answer help requests before the strict parse so they succeed with status 0.
        if (isHelpRequested(args)) {
            printHelp(options);
            return 0;
        }

        CommandLineParser parser = new GnuParser();

        try {
            CommandLine commandLine = parser.parse(options, args);

            // Kept for any help spelling the parser accepts that the pre-scan does not match.
            if (commandLine.hasOption('h')) {
                printHelp(options);
                return 0;
            }

            String basePath = commandLine.getOptionValue('b');
            String outputPath = commandLine.getOptionValue('o');
            boolean checkNew = commandLine.hasOption('c');
            boolean skipRedirect = commandLine.hasOption('r');

            return run(basePath, outputPath, checkNew, skipRedirect);

        } catch (ParseException e) {
            System.err.printf("Parsing failed.  Reason: %s%n", e.getMessage());
            printHelp(options);
            return 1;
        }
    }

    /**
     * Configures and runs the repacking job, blocking until it has finished.
     *
     * @param basePath     path handed to {@link WikiDumpLoader#addWikiDump} as the dump location
     * @param outputPath   path the job output is written to
     * @param checkNew     flag passed to the {@link WikiDumpLoader} constructor
     * @param skipRedirect if {@code true}, redirect pages are not written to the output
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if the job cannot be set up or executed
     */
    public int run(String basePath, String outputPath, boolean checkNew, boolean skipRedirect) throws Exception {
        Configuration configuration = getConf();
        // Must be set before Job.getInstance so the job configuration carries the flag.
        configuration.setBoolean(SKIP_REDIRECT_KEY, skipRedirect);

        LOGGER.info("Tool name: " + getClass().getSimpleName());

        Job job = Job.getInstance(configuration, getClass().getSimpleName());
        job.setJarByClass(getClass());

        job.setMapperClass(WikiMapper.class);
        job.setInputFormatClass(WikiInputFormat.class);
        // NOTE(review): MapFileOutputFormat has no import in this file; presumably it is
        // resolved from this package - confirm.
        job.setOutputFormatClass(MapFileOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(WikiRevisionWritable.class);

        WikiDumpLoader wikiDumpLoader = new WikiDumpLoader(checkNew);
        wikiDumpLoader.addWikiDump(job, basePath);

        MapFileOutputFormat.setOutputPath(job, new Path(outputPath));

        // All records go through one reduce task.
        // NOTE(review): presumably so the result is a single key-sorted MapFile - confirm.
        job.setNumReduceTasks(1);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Tells whether the raw arguments contain the help switch.
     *
     * @param args raw command line arguments, may be {@code null}
     * @return {@code true} if any argument equals {@code -h} or {@code --help}
     */
    private static boolean isHelpRequested(String[] args) {
        if (args == null) {
            return false;
        }
        for (String arg : args) {
            if ("-h".equals(arg) || "--help".equals(arg)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the command line options understood by this tool.
     *
     * @return the option set; {@code basePath} and {@code outputPath} are marked required
     */
    private static Options getOptions() {

        Options options = new Options();

        options.addOption("h", "help", false, "Show this message.");

        Option basePath = new Option("b", "basePath", true,
                "The path where your Wikipedia dumps get stored. " + "Creates subPaths.");
        basePath.setRequired(true);
        options.addOption(basePath);

        Option outputPath = new Option("o", "outputPath", true, "Path where the MapFile is stored.");
        outputPath.setRequired(true);
        options.addOption(outputPath);

        options.addOption("c", "checkNew", false, "Checks for new Wikipedia online.");
        options.addOption("r", "skipRedirect", false, "Skip redirect pages when indexing");

        return options;
    }

    /**
     * Prints the usage message for the given options.
     *
     * @param options options to describe
     */
    private static void printHelp(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("hadoop jar <jar>", options);
    }

    /**
     * Command line entry point; terminates the JVM with the status returned by the tool.
     *
     * @param args command line arguments
     * @throws Exception if the tool fails with an exception
     */
    public static void main(String[] args) throws Exception {
        // Propagate the tool's status so callers can detect a failed run.
        System.exit(ToolRunner.run(new RepackToMapFile(), args));
    }

    /**
     * Mapper that forwards each revision under its input key, counts redirect pages and,
     * when configured to, leaves them out of the output.
     */
    public static class WikiMapper
            extends Mapper<LongWritable, WikiRevisionWritable, LongWritable, WikiRevisionWritable> {

        // Instance state, initialised in setup(). Static fields would be shared by every
        // mapper instance in the same JVM and overwritten by whichever setup() ran last.
        private Counter redirectPagesCounter;
        private boolean skipRedirect;

        /**
         * Reads the counter and the skip flag from the task context.
         *
         * @param context task context supplying the counter and the job configuration
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            redirectPagesCounter = context.getCounter("Mapper", "redirectPages");
            skipRedirect = context.getConfiguration().getBoolean(SKIP_REDIRECT_KEY, false);
        }

        /**
         * Writes the revision to the output unless it is a redirect that should be skipped.
         * Redirects are counted whether or not they are skipped.
         *
         * @param key     input key, reused as output key
         * @param value   revision to forward
         * @param context task context the record is written to
         */
        @Override
        protected void map(LongWritable key, WikiRevisionWritable value, Context context)
                throws IOException, InterruptedException {
            if (value.getPage().isRedirect()) {
                redirectPagesCounter.increment(1);

                if (skipRedirect) {
                    return;
                }
            }

            context.write(key, value);
        }
    }
}