// Source file: com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

/**
 * Copyright (C) 2004-2014 Synerzip. 
 * 
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package com.synerzip.analytics.commoncrawl.googleads.counter;

import java.net.URI;

import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import com.martinkl.warc.mapreduce.WARCInputFormat;
import com.synerzip.analytics.commoncrawl.common.WarcFileFilter;
import com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterMapper.TestCounters;

/**
 * The MapReduce Job to find how many HTML Pages in Common Crawl Dataset use
 * Google Ads for Analytics.
 * 
 * @author Sumeet Nikam
 *
 */
public class GoogleAdsCounterJob extends Configured implements Tool {

    private static final Logger LOG = Logger.getLogger(GoogleAdsCounterJob.class);
    private static final String ARGNAME_INPATH = "-in";
    private static final String ARGNAME_OUTPATH = "-out";
    private static final String ARGNAME_OVERWRITE = "-overwrite";
    private static final String ARGNAME_MAXFILES = "-maxfiles";
    private static final String FILEFILTER = ".warc.gz";
    private static final String ARGNAME_S3ACCESSKEY = "-accesskey";
    private static final String ARGNAME_S3SECRETKEY = "-secretkey";

    /**
     * Main Method for MapReduce Job.
     *
     * @param args command-line arguments; see {@link #usage()} for the accepted flags
     * @throws Exception
     *             That arises while running this Hadoop Map Reduce Job.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new GoogleAdsCounterJob(), args);
        System.exit(res);
    }

    /**
     * Prints the usage of this Map Reduce program on the console.
     */
    private void usage() {
        System.out.println("\n  com.synerzip.analytics.commoncrawl.googleads.counter \n"
                + "                           " + ARGNAME_INPATH + " <inputpath>\n" + "                           "
                + ARGNAME_OUTPATH + " <outputpath>\n" + "                         [ " + ARGNAME_S3ACCESSKEY
                + " <accesskey> ]\n" + "                         [ " + ARGNAME_S3SECRETKEY + " <secretkey> ]\n"
                + "                         [ " + ARGNAME_OVERWRITE + " ]\n" + "                         [ "
                + ARGNAME_MAXFILES + " <maxfiles> ]");
        System.out.println("");
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    /**
     * Configures and submits the Map Reduce Job to Hadoop.
     *
     * @param args command-line arguments (input/output paths, optional S3
     *             credentials, optional overwrite/maxfiles flags)
     * @return 0 on job success, 1 on job failure
     * @throws IllegalArgumentException if required arguments are missing or malformed
     * @throws Exception on Hadoop submission or filesystem errors
     */
    public int run(String[] args) throws Exception {

        String inputPath = null;
        String outputPath = null;
        boolean overwrite = false;
        String s3AccessKey = null;
        String s3SecretKey = null;

        // Read the command line arguments. We're not using GenericOptionsParser
        // to prevent having to include commons.cli as a dependency.
        for (int index = 0; index < args.length; index++) {
            try {
                if (ARGNAME_INPATH.equals(args[index])) {
                    inputPath = args[++index];
                } else if (ARGNAME_OUTPATH.equals(args[index])) {
                    outputPath = args[++index];
                } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                    s3AccessKey = args[++index];
                } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                    s3SecretKey = args[++index];
                } else if (ARGNAME_MAXFILES.equals(args[index])) {
                    // FIXME - No use of static methods
                    WarcFileFilter.setMax(Long.parseLong(args[++index]));
                } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                    overwrite = true;
                } else {
                    LOG.warn("Unsupported argument: " + args[index]);
                }
            } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
                // Missing value after a flag, or a non-numeric -maxfiles value.
                // (The original code let NumberFormatException escape uncaught.)
                usage();
                throw new IllegalArgumentException("Invalid command line arguments", e);
            }
        }

        if (inputPath == null || outputPath == null) {
            usage();
            throw new IllegalArgumentException();
        }

        if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
            usage();
            LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
            throw new IllegalArgumentException();
        }

        // BUG FIX: use the Configuration supplied by ToolRunner (via Configured)
        // instead of creating a fresh one, so generic options (-D, -conf, -fs)
        // passed on the command line actually take effect.
        Configuration conf = (getConf() != null) ? getConf() : new Configuration();

        // BUG FIX: credentials must be set on the Configuration BEFORE the Job
        // is created. Job.getInstance(conf) takes a private copy of the
        // Configuration, so the original code's conf.set(...) calls made after
        // job creation were silently ignored by the submitted job.
        if (inputPath.contains("s3n") && s3AccessKey != null && s3SecretKey != null) {
            conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
            conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
            // NOTE(review): Hadoop's s3n filesystem reads the property names
            // below, not the two above — set both for safety; confirm which
            // the cluster's Hadoop version expects.
            conf.set("fs.s3n.awsAccessKeyId", s3AccessKey);
            conf.set("fs.s3n.awsSecretAccessKey", s3SecretKey);
        }

        // Create the Hadoop job.
        Job job = Job.getInstance(conf);
        job.setJarByClass(GoogleAdsCounterJob.class);

        // Scan the provided input path for WARC files.
        LOG.info("setting input path to '" + inputPath + "'");

        WarcFileFilter.setFilter(FILEFILTER);
        FileInputFormat.addInputPath(job, new Path(inputPath));

        // FIXME - I see the problem that you want to give a dynamic number to a
        // static class. My question is, Is this really required, if we just
        // point to a file in s3 that should solve our problem
        FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

        // Delete the output path directory if it already exists and user wants
        // to overwrite it.
        if (overwrite) {
            LOG.info("clearing the output path at '" + outputPath + "'");
            FileSystem fs = FileSystem.get(new URI(outputPath), conf);
            if (fs.exists(new Path(outputPath))) {
                fs.delete(new Path(outputPath), true);
            }
        }

        // Set the path where final output 'part' files will be saved.
        LOG.info("setting output path to '" + outputPath + "'");
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Set which InputFormat class to use.
        job.setInputFormatClass(WARCInputFormat.class);

        // Set which OutputFormat class to use. (If MultipleOutputs is ever
        // enabled, switch to LazyOutputFormat to avoid zero-sized part files.)
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map output types (must match GoogleAdsCounterMapper's emissions).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the final output data types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set which Mapper and Reducer classes to use.
        job.setMapperClass(GoogleAdsCounterMapper.class);
        job.setReducerClass(GoogleAdsCounterReducer.class);

        // set job name
        job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

        long startTime = System.currentTimeMillis();

        // BUG FIX: the original code called job.waitForCompletion(true) a
        // second time inside the success branch to log the status. Calling it
        // again on an already-submitted job fails (IllegalStateException) /
        // re-submits the job. Submit exactly once and reuse the result.
        boolean succeeded = job.waitForCompletion(true);
        LOG.info("Job completion status : " + succeeded);

        if (succeeded) {
            long difference = System.currentTimeMillis() - startTime;
            LOG.info("Elapsed milliseconds: " + difference);

            Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
            LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

            Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
            LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

            return 0;
        } else {
            return 1;
        }
    }

}