List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance
public static Job getInstance(Configuration conf) throws IOException
@Deprecated public static Job getInstance(Cluster ignored) throws IOException
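Before the source-file examples, here is a minimal, hedged sketch of the pattern most of the examples below follow: build a Configuration, obtain a Job via Job.getInstance, configure the jar, mapper, reducer, formats and paths, then submit with waitForCompletion. The driver class name WordCountDriver and the MyMapper/MyReducer classes are illustrative placeholders, not taken from any example on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Factory method: creates a Job bound to a copy of this configuration.
        Job job = Job.getInstance(conf, "word count");

        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(MyMapper.class);     // hypothetical Mapper implementation
        job.setReducerClass(MyReducer.class);   // hypothetical Reducer implementation

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}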
From source file:com.soteradefense.dga.louvain.mapreduce.CommunityCompression.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration mrConf = this.getConf();
    for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(CommunityCompression.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}
From source file:com.soteradefense.dga.louvain.mapreduce.LouvainTableSynthesizer.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = null;
    try {
        int iteration = 0;
        if (!basePath.endsWith("/"))
            basePath = basePath + "/";
        String inputPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        String joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
        String outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        Configuration mrConf = this.getConf();
        job = Job.getInstance(mrConf);
        for (Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
            mrConf.set(entry.getKey(), entry.getValue());
        }
        FileSystem fs = FileSystem.get(job.getConfiguration());
        boolean nextFileExists = fs.exists(new Path(joinPath));
        while (nextFileExists) {
            System.out.println("Processing " + inputPath + " and " + joinPath);
            job = Job.getInstance(mrConf);
            job.setJobName("Louvain Table Synthesizer " + iteration);

            job.setJarByClass(LouvainTableSynthesizer.class);

            job.setMapperClass(LouvainTableSynthesizerMapper.class);
            job.setReducerClass(LouvainTableSynthesizerReducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            // Reducer output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            // Add both input folders
            Path in = new Path(inputPath);
            Path joinIn = new Path(joinPath);
            Path out = new Path(outputPath);
            FileInputFormat.addInputPath(job, in);
            FileInputFormat.addInputPath(job, joinIn);
            FileOutputFormat.setOutputPath(job, out);

            job.waitForCompletion(true);

            // Set the new temp input path
            inputPath = outputPath;
            iteration++;
            outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
            joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
            nextFileExists = fs.exists(new Path(joinPath));
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        e.printStackTrace();
        return -1;
    }
    return 0;
}
From source file:com.soteradefense.dga.LouvainRunner.java
License:Apache License
private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    // Build the job from the populated MapReduce configuration.
    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}
From source file:com.splicemachine.derby.stream.spark.SparkExportDataSetWriter.java
License:Apache License
@Override
public DataSet<LocatedRow> write() throws StandardException {
    Configuration conf = new Configuration(HConfiguration.unwrapDelegate());
    ByteDataOutput bdo = new ByteDataOutput();
    Job job;
    String encoded;

    try {
        bdo.writeObject(exportFunction);
        encoded = Base64.encodeBase64String(bdo.toByteArray());
        conf.set("exportFunction", encoded);
        job = Job.getInstance(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    job.setOutputKeyClass(Void.class);
    job.setOutputValueClass(LocatedRow.class);
    job.setOutputFormatClass(SparkDataSet.EOutputFormat.class);
    job.getConfiguration().set("mapred.output.dir", directory);

    JavaRDD<V> cached = rdd.cache();
    long writtenRows = cached.count();
    rdd.keyBy(new NullFunction<V>())
            .setName(String.format("Export Directory: %s", directory))
            .saveAsNewAPIHadoopDataset(job.getConfiguration());
    cached.unpersist();

    ValueRow valueRow = new ValueRow(2);
    valueRow.setColumn(1, new SQLLongint(writtenRows));
    valueRow.setColumn(2, new SQLInteger(0));
    return new SparkDataSet<>(
            SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
}
From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java
License:Apache License
@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's a map-only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}
From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.SimpleJobCreator.java
License:Apache License
@Override
public Job call() throws Exception {
    return Job.getInstance(conf);
}
From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounter.java
License:Apache License
/**
 * @param args
 * @throws IOException
 * @throws Exception
 */
public static void main(String[] args) throws IOException, Exception {

    String inputPath = null;
    String outputPath = null;
    String master = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments.
    for (int i = 0; i < args.length; i++) {
        try {
            if (args[i].equals(ARGNAME_INPATH)) {
                inputPath = args[++i];
            } else if (args[i].equals(ARGNAME_OUTPATH)) {
                outputPath = args[++i];
            } else if (args[i].equals(ARGNAME_MASTER)) {
                master = args[++i];
            } else if (args[i].equals(ARGNAME_S3ACCESSKEY)) {
                s3AccessKey = args[++i];
            } else if (args[i].equals(ARGNAME_S3SECRETKEY)) {
                s3SecretKey = args[++i];
            } else if (args[i].equals(ARGNAME_MAXFILES)) {
                WarcFileFilter.setMax(Long.parseLong(args[++i]));
            } else if (args[i].equals(ARGNAME_OVERWRITE)) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }
    LOG.info(" inputPath :" + inputPath);

    if (inputPath == null || outputPath == null || master == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    SparkConf sparkConf = new SparkConf().setAppName("GoogleAdsCounter").setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Define the accumulators to count total response pages and total Google Ad pages.
    final Accumulator<Integer> totalResponsePagesAccumulator = sc.accumulator(0);
    final Accumulator<Integer> totalGoogleAdPagesAccumulator = sc.accumulator(0);

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    JavaPairRDD<LongWritable, WARCWritable> records = sc.newAPIHadoopFile(inputPath, WARCInputFormat.class,
            LongWritable.class, WARCWritable.class, job.getConfiguration());

    JavaPairRDD<String, Integer> warcRecords = records
            .mapToPair(new PairFunction<Tuple2<LongWritable, WARCWritable>, String, Integer>() {

                public Tuple2<String, Integer> call(Tuple2<LongWritable, WARCWritable> record) throws Exception {

                    String recordType = record._2().getRecord().getHeader().getRecordType();
                    String adType = null;

                    if (recordType.equals("response")) {
                        totalResponsePagesAccumulator.add(1); // total response pages

                        String recordContent = new String(record._2().getRecord().getContent());

                        // Parse the HTML content of the web page using Jsoup.
                        Document doc = Jsoup.parse(recordContent);

                        // Get the <script> tag elements.
                        Elements scriptElements = doc.getElementsByTag("script");

                        for (Element element : scriptElements) {
                            // If the web page has Google ads, the <script> tag contains "google_ad_client".
                            if (element.data().contains("google_ad_client")) {
                                totalGoogleAdPagesAccumulator.add(1);

                                GoogleAdParser parser = new DefaultParser(element.data());

                                String siteUrl = record._2().getRecord().getHeader().getTargetURI();
                                String title = "Default"; // FIXME

                                String adClient = parser.getAttribute("google_ad_client") != null
                                        ? parser.getAttribute("google_ad_client")
                                        : "NA";
                                String adSlot = "default"; // FIXME
                                String width = parser.getAttribute("google_ad_width") != null
                                        ? parser.getAttribute("google_ad_width")
                                        : "NA";
                                String height = parser.getAttribute("google_ad_height") != null
                                        ? parser.getAttribute("google_ad_height")
                                        : "NA";
                                adType = parser.getAttribute("google_ad_type") != null
                                        ? parser.getAttribute("google_ad_type")
                                        : "text";
                            }
                        }
                        return new Tuple2<String, Integer>(adType, 1);
                    } else {
                        return new Tuple2<String, Integer>(adType, 1);
                    }
                }
            });

    JavaPairRDD<String, Integer> adTypeCounts = warcRecords
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    // Delete the output path directory if it already exists and the user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    long startTime = System.currentTimeMillis();

    // Write the output to file.
    adTypeCounts.saveAsNewAPIHadoopFile(outputPath, org.apache.hadoop.io.Text.class,
            org.apache.hadoop.io.Text.class, org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    // Print accumulator values.
    LOG.info(" totalResponsePagesAccumulator value : " + totalResponsePagesAccumulator.value());
    LOG.info(" totalGoogleAdPagesAccumulator value : " + totalGoogleAdPagesAccumulator.value());

    long endTime = System.currentTimeMillis();
    long difference = endTime - startTime;
    LOG.info("Elapsed milliseconds: " + difference);

    // Stop the Spark context.
    sc.stop();
}
From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java
License:Apache License
/**
 * Configures and submits the MapReduce job to Hadoop.
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {
            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);

    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, is this really required? If we just
    // point to a file in S3 that should solve our problem.
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and the user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    /*
     * // Defines additional single text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class, Text.class, LongWritable.class);
     *
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType", TextOutputFormat.class, Text.class, LongWritable.class);
     */

    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates a zero-sized default output, e.g. part-r-00000.
     * To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // job.setNumReduceTasks(4);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // Set combiner
    // job.setCombinerClass(GoogleAdsCounterReducer.class);

    // Set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();

    // Submit the job once and reuse the result; calling waitForCompletion twice
    // would attempt to resubmit an already-submitted job.
    boolean success = job.waitForCompletion(true);
    if (success) {
        LOG.info("Job completion status : " + success);

        long endTime = System.currentTimeMillis();
        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);

        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java
License:Apache License
/**
 * Verify listing statuses without any configuration.
 */
@Test
public void listStatus() throws IOException {
    // Create temp files
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));

    final File folder1 = tempFolder.newFolder("folder1");
    final File file2 = tempFolder.newFile("folder1/file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis));

    tempFolder.newFolder("folder1", "folder2");
    final File file3 = tempFolder.newFile("folder1/folder2/file3");
    Assert.assertTrue(file3.setLastModified(currentTimeMillis));

    // Test listing files
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setInputPaths(job, file1.getAbsolutePath() + "," + folder1.getAbsolutePath());

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());

    Assert.assertEquals(new Path(file1.toURI()), files.get(0).getPath());
    Assert.assertEquals(new Path(file2.toURI()), files.get(1).getPath());
    Assert.assertEquals(2, files.size());
    Assert.assertEquals(currentTimeMillis, inputFormat.getLastHighWaterMark());

    // Verify files are not listed again
    Assert.assertEquals(0, inputFormat.listStatus(job).size());
}
From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java
License:Apache License
/**
 * Verify listing files with a high water mark.
 */
@Test
public void listStatusHighWaterMark() throws IOException {
    // Create temp files
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));

    final File file2 = tempFolder.newFile("file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis + 1000));

    // Test listing files with high water mark
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setHighWaterMark(job, currentTimeMillis);
    HighWaterMarkInputFormat.setInputPaths(job, tempFolder.getRoot().getAbsolutePath());

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());

    Assert.assertEquals(new Path(file2.toURI()), files.get(0).getPath());
    Assert.assertEquals(1, files.size());
    Assert.assertEquals(currentTimeMillis + 1000, inputFormat.getLastHighWaterMark());
}