Java tutorial
/** * Copyright (C) 2004-2014 Synerzip. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.synerzip.analytics.commoncrawl.googleads.counter; import java.net.URI; import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import com.martinkl.warc.mapreduce.WARCInputFormat; import com.synerzip.analytics.commoncrawl.common.WarcFileFilter; import com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterMapper.TestCounters; /** * The MapReduce Job to find how many HTML Pages in Common Crawl Dataset use * Google Ads for Analytics. * * @author Sumeet Nikam * */ public class GoogleAdsCounterJob extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(GoogleAdsCounterJob.class); private static final String ARGNAME_INPATH = "-in"; private static final String ARGNAME_OUTPATH = "-out"; private static final String ARGNAME_OVERWRITE = "-overwrite"; private static final String ARGNAME_MAXFILES = "-maxfiles"; private static final String FILEFILTER = ".warc.gz"; private static final String ARGNAME_S3ACCESSKEY = "-accesskey"; private static final String ARGNAME_S3SECRETKEY = "-secretkey"; /** * Main Method for MapReduce Job * * @param args * @throws Exception * That arises while running this Hadoop Map Reduce Job. */ public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new GoogleAdsCounterJob(), args); System.exit(res); } /** * Prints the usage of this Map Reduce program on the console. */ private void usage() { System.out.println("\n com.synerzip.analytics.commoncrawl.googleads.counter \n" + " " + ARGNAME_INPATH + " <inputpath>\n" + " " + ARGNAME_OUTPATH + " <outputpath>\n" + " [ " + ARGNAME_S3ACCESSKEY + " <accesskey> ]\n" + " [ " + ARGNAME_S3SECRETKEY + " <secretkey> ]\n" + " [ " + ARGNAME_OVERWRITE + " ]\n" + " [ " + ARGNAME_MAXFILES + " <maxfiles> ]"); System.out.println(""); GenericOptionsParser.printGenericCommandUsage(System.out); } /** * Configures and submits the Map Reduce Job to Hadoop */ public int run(String[] args) throws Exception { String inputPath = null; String outputPath = null; boolean overwrite = false; String s3AccessKey = null; String s3SecretKey = null; // Read the command line arguments. We're not using GenericOptionsParser // to prevent having to include commons.cli as a dependency. for (int index = 0; index < args.length; index++) { try { if (ARGNAME_INPATH.equals(args[index])) { inputPath = args[++index]; } else if (ARGNAME_OUTPATH.equals(args[index])) { outputPath = args[++index]; } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) { s3AccessKey = args[++index]; } else if (ARGNAME_S3SECRETKEY.equals(args[index])) { s3SecretKey = args[++index]; } else if (ARGNAME_MAXFILES.equals(args[index])) { // FIXME - No use of static methods WarcFileFilter.setMax(Long.parseLong(args[++index])); } else if (ARGNAME_OVERWRITE.equals(args[index])) { overwrite = true; } else { LOG.warn("Unsupported argument: " + args[index]); } } catch (ArrayIndexOutOfBoundsException e) { usage(); throw new IllegalArgumentException(); } } if (inputPath == null || outputPath == null) { usage(); throw new IllegalArgumentException(); } if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) { usage(); LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage "); throw new IllegalArgumentException(); } // Create the Hadoop job. Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(GoogleAdsCounterJob.class); if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) { conf.set("AWS_ACCESS_KEY_ID", s3AccessKey); conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey); } // Scan the provided input path for WARC files. LOG.info("setting input path to '" + inputPath + "'"); WarcFileFilter.setFilter(FILEFILTER); FileInputFormat.addInputPath(job, new Path(inputPath)); // FIXME - I see the problem that you want to give a dynamic number to a // static class. My question is, Is this really required, if we just // point to a file in s3 that should solve our problem FileInputFormat.setInputPathFilter(job, WarcFileFilter.class); // Delete the output path directory if it already exists and user wants // to overwrite it. if (overwrite) { LOG.info("clearing the output path at '" + outputPath + "'"); FileSystem fs = FileSystem.get(new URI(outputPath), conf); if (fs.exists(new Path(outputPath))) { fs.delete(new Path(outputPath), true); } } // Set the path where final output 'part' files will be saved. LOG.info("setting output path to '" + outputPath + "'"); FileOutputFormat.setOutputPath(job, new Path(outputPath)); /* * // Defines additional single text based output 'GoogleAdClient' for * the job MultipleOutputs.addNamedOutput(job, "GoogleAdClient", * TextOutputFormat.class, Text.class,LongWritable.class ); * * // Defines additional text based output 'GoogleAdType' for the job * MultipleOutputs.addNamedOutput(job, * "GoogleAdType",TextOutputFormat.class, Text.class, * LongWritable.class); */ // Set which InputFormat class to use. job.setInputFormatClass(WARCInputFormat.class); // Set which OutputFormat class to use. job.setOutputFormatClass(TextOutputFormat.class); /* * Using MultipleOutputs creates zero-sized default output e.g.: * * part-r-00000. To prevent this use LazyOutputFormat instead of * job.setOutputFormatClass(TextOutputFormat.class) in Hadoop job * configuration. */ // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // job.setPartitionerClass(GoogleAdsCounterPartitioner.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); //job.setNumReduceTasks(4); // Set the output data types. job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // Set which Mapper and Reducer classes to use. job.setMapperClass(GoogleAdsCounterMapper.class); // job.setMapperClass(CrawlMapper_AdStatsDetails.class); job.setReducerClass(GoogleAdsCounterReducer.class); // set combiner //job.setCombinerClass(GoogleAdsCounterReducer.class); // set job name job.setJobName("CommonCrawl Data Processing : Counting Google Ads"); long startTime = System.currentTimeMillis(); if (job.waitForCompletion(true)) { LOG.info("Job completion status : " + job.waitForCompletion(true)); long endTime = System.currentTimeMillis(); long difference = endTime - startTime; LOG.info("Elapsed milliseconds: " + difference); Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES); LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue()); Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES); LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue()); return 0; } else { return 1; } } }