Example usage for org.apache.hadoop.mapreduce.Job.getInstance

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.getInstance.

Prototype

@Deprecated
public static Job getInstance(Cluster ignored) throws IOException 

Document

Creates a new Job with no particular Cluster.
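The prototype shown above is the deprecated Cluster-based overload; the examples that follow use the Job.getInstance(Configuration) and Job.getInstance(Configuration, String) overloads, which copy the supplied Configuration into the new job. Below is a minimal, self-contained sketch of that common pattern, for orientation only; LineCountJob, LineMapper, and SumReducer are placeholder names and do not appear in any of the examples on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LineCountJob {

    // Emits the constant key "lines" with a count of 1 for every input line.
    public static class LineMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final Text KEY = new Text("lines");
        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            context.write(KEY, ONE);
        }
    }

    // Sums all counts emitted for a key.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Job.getInstance copies 'conf'; set every needed property before this call,
        // or use job.getConfiguration() afterwards.
        Job job = Job.getInstance(conf, "line count");
        job.setJarByClass(LineCountJob.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(LineMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // must not already exist

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because getInstance copies the Configuration, any property that must reach the job (for example the S3 credentials in the examples below) has to be set on the Configuration before the call, or on job.getConfiguration() afterwards.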

Usage

From source file:com.soteradefense.dga.louvain.mapreduce.CommunityCompression.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration mrConf = this.getConf();
    for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(CommunityCompression.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.soteradefense.dga.louvain.mapreduce.LouvainTableSynthesizer.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = null;
    try {
        int iteration = 0;
        if (!basePath.endsWith("/"))
            basePath = basePath + "/";
        String inputPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        String joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
        String outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        Configuration mrConf = this.getConf();
        job = Job.getInstance(mrConf);

        for (Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
            mrConf.set(entry.getKey(), entry.getValue());
        }

        FileSystem fs = FileSystem.get(job.getConfiguration());
        boolean nextFileExists = fs.exists(new Path(joinPath));
        while (nextFileExists) {
            System.out.println("Processing " + inputPath + " and " + joinPath);
            job = Job.getInstance(mrConf);
            job.setJobName("Louvain Table Synthesizer " + iteration);

            job.setJarByClass(LouvainTableSynthesizer.class);

            job.setMapperClass(LouvainTableSynthesizerMapper.class);
            job.setReducerClass(LouvainTableSynthesizerReducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            //Reducer Output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            //Add both input folders
            Path in = new Path(inputPath);
            Path joinIn = new Path(joinPath);
            Path out = new Path(outputPath);
            FileInputFormat.addInputPath(job, in);
            FileInputFormat.addInputPath(job, joinIn);
            FileOutputFormat.setOutputPath(job, out);

            job.waitForCompletion(true);
            //Set the new temp input path
            inputPath = outputPath;
            iteration++;
            outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
            joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
            nextFileExists = fs.exists(new Path(joinPath));
        }

    } catch (IOException e) {
        e.printStackTrace();
        return -1;
    } catch (InterruptedException e) {
        e.printStackTrace();
        return -1;
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        return -1;
    }
    return 0;
}

From source file:com.soteradefense.dga.LouvainRunner.java

License:Apache License

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.splicemachine.derby.stream.spark.SparkExportDataSetWriter.java

License:Apache License

@Override
public DataSet<LocatedRow> write() throws StandardException {
    Configuration conf = new Configuration(HConfiguration.unwrapDelegate());
    ByteDataOutput bdo = new ByteDataOutput();
    Job job;
    String encoded;

    try {
        bdo.writeObject(exportFunction);
        encoded = Base64.encodeBase64String(bdo.toByteArray());
        conf.set("exportFunction", encoded);
        job = Job.getInstance(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    job.setOutputKeyClass(Void.class);
    job.setOutputValueClass(LocatedRow.class);
    job.setOutputFormatClass(SparkDataSet.EOutputFormat.class);
    job.getConfiguration().set("mapred.output.dir", directory);

    JavaRDD<V> cached = rdd.cache();
    long writtenRows = cached.count();
    rdd.keyBy(new NullFunction<V>()).setName(String.format("Export Directory: %s", directory))
            .saveAsNewAPIHadoopDataset(job.getConfiguration());
    cached.unpersist();

    ValueRow valueRow = new ValueRow(2);
    valueRow.setColumn(1, new SQLLongint(writtenRows));
    valueRow.setColumn(2, new SQLInteger(0));
    return new SparkDataSet<>(
            SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's a map-only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.SimpleJobCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    return Job.getInstance(conf);
}

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounter.java

License:Apache License

/**
 * @param args
 * @throws IOException
 * @throws Exception
 */
public static void main(String[] args) throws IOException, Exception {

    String inputPath = null;
    String outputPath = null;
    String master = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments.
    for (int i = 0; i < args.length; i++) {
        try {
            if (args[i].equals(ARGNAME_INPATH)) {
                inputPath = args[++i];
            } else if (args[i].equals(ARGNAME_OUTPATH)) {
                outputPath = args[++i];
            } else if (args[i].equals(ARGNAME_MASTER)) {
                master = args[++i];
            } else if (args[i].equals(ARGNAME_S3ACCESSKEY)) {
                s3AccessKey = args[++i];
            } else if (args[i].equals(ARGNAME_S3SECRETKEY)) {
                s3SecretKey = args[++i];
            } else if (args[i].equals(ARGNAME_MAXFILES)) {
                WarcFileFilter.setMax(Long.parseLong(args[++i]));
            } else if (args[i].equals(ARGNAME_OVERWRITE)) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }
    LOG.info(" inputPath :" + inputPath);
    if (inputPath == null || outputPath == null || master == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    SparkConf sparkConf = new SparkConf().setAppName("GoogleAdsCounter").setMaster(master);

    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Configuration conf = new Configuration();
    if (inputPath.contains("s3n") && s3AccessKey != null && s3SecretKey != null) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    // Create the job only after the configuration is complete: Job.getInstance copies conf,
    // so properties set on conf afterwards would not reach job.getConfiguration().
    Job job = Job.getInstance(conf);

    //define the accumulators to count total response pages and total Google Ad Pages
    final Accumulator<Integer> totalResponsePagesAccumulator = sc.accumulator(0);
    final Accumulator<Integer> totalGoogleAdPagesAccumulator = sc.accumulator(0);

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    JavaPairRDD<LongWritable, WARCWritable> records = sc.newAPIHadoopFile(inputPath, WARCInputFormat.class,
            LongWritable.class, WARCWritable.class, job.getConfiguration());

    JavaPairRDD<String, Integer> warcRecords = records
            .mapToPair(new PairFunction<Tuple2<LongWritable, WARCWritable>, String, Integer>() {

                public Tuple2<String, Integer> call(Tuple2<LongWritable, WARCWritable> record)
                        throws Exception {

                    String recordType = record._2().getRecord().getHeader().getRecordType();

                    String adType = null;
                    if (recordType.equals("response")) {

                        totalResponsePagesAccumulator.add(1); // total response pages

                        String recordContent = new String(record._2().getRecord().getContent());

                        // parse Html content of web page using Jsoup
                        Document doc = Jsoup.parse(recordContent);

                        // Get the <script> tag elements 
                        Elements scriptElements = doc.getElementsByTag("script");

                        for (Element element : scriptElements) {

                            // if the web page has Google ads, its <script> tag contains "google_ad_client"
                            if (element.data().contains("google_ad_client")) {

                                totalGoogleAdPagesAccumulator.add(1);

                                GoogleAdParser parser = new DefaultParser(element.data());

                                String siteUrl = record._2().getRecord().getHeader().getTargetURI();
                                String title = "Default"; // FIXME

                                String adClient = parser.getAttribute("google_ad_client") != null
                                        ? parser.getAttribute("google_ad_client")
                                        : "NA";
                                String adSlot = "default"; // FIXME
                                String width = parser.getAttribute("google_ad_width") != null
                                        ? parser.getAttribute("google_ad_width")
                                        : "NA";
                                String height = parser.getAttribute("google_ad_height") != null
                                        ? parser.getAttribute("google_ad_height")
                                        : "NA";
                                adType = parser.getAttribute("google_ad_type") != null
                                        ? parser.getAttribute("google_ad_type")
                                        : "text";
                            }
                        }
                        return new Tuple2<String, Integer>(adType, 1);
                    } else
                        return new Tuple2<String, Integer>(adType, 1);

                }
            });

    JavaPairRDD<String, Integer> adTypeCounts = warcRecords
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    long startTime = System.currentTimeMillis();

    //writing output to file
    adTypeCounts.saveAsNewAPIHadoopFile(outputPath, org.apache.hadoop.io.Text.class,
            org.apache.hadoop.io.Text.class, org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    //print accumulator values      
    LOG.info(" totalResponsePagesAccumulator value : " + totalResponsePagesAccumulator.value());
    LOG.info(" totalGoogleAdPagesAccumulator value : " + totalGoogleAdPagesAccumulator.value());
    long endTime = System.currentTimeMillis();
    long difference = endTime - startTime;
    LOG.info("Elapsed milliseconds: " + difference);

    //stop spark context
    sc.stop();
}

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    /*
     * // Defines additional single text based output 'GoogleAdClient' for
     * the job MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     * TextOutputFormat.class, Text.class,LongWritable.class );
     * 
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job,
     * "GoogleAdType",TextOutputFormat.class, Text.class,
     * LongWritable.class);
     */
    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates zero-sized default output e.g.: *
     * part-r-00000. To prevent this use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in Hadoop job
     * configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);
    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {

        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java

License:Apache License

/**
 * Verify listing statuses without any configuration.
 */
@Test
public void listStatus() throws IOException {
    // Create temp files
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));

    final File folder1 = tempFolder.newFolder("folder1");
    final File file2 = tempFolder.newFile("folder1/file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis));

    tempFolder.newFolder("folder1", "folder2");
    final File file3 = tempFolder.newFile("folder1/folder2/file3");
    Assert.assertTrue(file3.setLastModified(currentTimeMillis));

    // Test listing files
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setInputPaths(job, file1.getAbsolutePath() + "," + folder1.getAbsolutePath());

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());
    Assert.assertEquals(new Path(file1.toURI()), files.get(0).getPath());
    Assert.assertEquals(new Path(file2.toURI()), files.get(1).getPath());
    Assert.assertEquals(2, files.size());
    Assert.assertEquals(currentTimeMillis, inputFormat.getLastHighWaterMark());

    // Verify files are not listed again
    Assert.assertEquals(0, inputFormat.listStatus(job).size());
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java

License:Apache License

/**
 * Verify listing files with a high water mark.
 */
@Test
public void listStatusHighWaterMark() throws IOException {
    // Create temp file
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));

    final File file2 = tempFolder.newFile("file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis + 1000));

    // Test listing files with high water mark
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setHighWaterMark(job, currentTimeMillis);
    HighWaterMarkInputFormat.setInputPaths(job, tempFolder.getRoot().getAbsolutePath());

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());
    Assert.assertEquals(new Path(file2.toURI()), files.get(0).getPath());
    Assert.assertEquals(1, files.size());
    Assert.assertEquals(currentTimeMillis + 1000, inputFormat.getLastHighWaterMark());
}